"""Import parallel English/Chinese chapter texts into a SQLite database."""

import re
import sqlite3
from dataclasses import dataclass
from typing import List, Tuple, Dict

# NOTE(review): the three patterns were empty (r'') in the source this was
# recovered from, which made every line "match" and then crash on group(1).
# They are reconstructed here from the in-code comments (BOOK opening tag,
# CHAPTER opening tag, any end tag). Both `id=abc` and `id="abc"` are
# accepted -- confirm against the real train.en / train.zh files.
_BOOK_OPEN = re.compile(r'\s*<BOOK\s+id\s*=\s*"?([^">\s]+)"?\s*>')
_CHAPTER_OPEN = re.compile(r'\s*<CHAPTER\s+id\s*=\s*"?([^">\s]+)"?\s*>')
_END_TAG = re.compile(r"\s*</(?:BOOK|CHAPTER)\s*>")


@dataclass
class TextUnit:
    """One chapter of text, identified by the book and chapter it belongs to."""

    book_id: str
    chapter_id: str
    text: str


def parse_file(filename: str) -> List[TextUnit]:
    """Parse a tagged text file and return one TextUnit per non-empty chapter.

    Lines are classified as a BOOK opening tag, a CHAPTER opening tag, an end
    tag, or chapter content. Content lines are kept verbatim (including their
    newline) and concatenated into the chapter text.

    A chapter is emitted when the first end tag after its opening tag is
    reached, so a chapter closed only by the book's end tag is still saved.
    Chapters with no content lines are skipped, lines outside any chapter are
    ignored, and a chapter that is never followed by an end tag is dropped.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        The parsed chapters in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    units: List[TextUnit] = []
    current_book = ""
    current_chapter = ""
    current_text: List[str] = []
    # Tracks whether we are between a CHAPTER opening tag and its end tag, so
    # that consecutive end tags cannot save the same chapter twice.
    in_chapter = False

    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            book_match = _BOOK_OPEN.match(line)
            if book_match:
                current_book = book_match.group(1)
                continue

            chapter_match = _CHAPTER_OPEN.match(line)
            if chapter_match:
                current_chapter = chapter_match.group(1)
                current_text = []
                in_chapter = True
                continue

            if _END_TAG.match(line):
                if in_chapter and current_text:
                    units.append(
                        TextUnit(
                            book_id=current_book,
                            chapter_id=current_chapter,
                            text="".join(current_text),
                        )
                    )
                in_chapter = False
                current_text = []
                continue

            if in_chapter:
                current_text.append(line)

    return units


def create_database(
    db_name: str = "parallel_texts.db", schema_path: str = "schema.sql"
) -> sqlite3.Connection:
    """Open the database and apply the SQL schema script to it.

    Args:
        db_name: SQLite database path passed to ``sqlite3.connect``.
        schema_path: Path of the SQL script that creates the schema.

    Returns:
        An open connection with the schema script executed and committed.

    Raises:
        OSError: If the schema file cannot be read.
        sqlite3.Error: If the schema script fails to execute.
    """
    # Read the schema before connecting so a missing file leaves no open
    # connection (and no freshly created empty database file) behind.
    with open(schema_path, "r", encoding="utf-8") as f:
        schema_sql = f.read()

    conn = sqlite3.connect(db_name)
    try:
        conn.executescript(schema_sql)
        conn.commit()
    except BaseException:
        conn.close()
        raise
    return conn


def import_texts(
    en_units: List[TextUnit], zh_units: List[TextUnit], conn: sqlite3.Connection
):
    """Insert books and chapters, pairing each English chapter with its Chinese text.

    Only books and chapters present in ``en_units`` are written. A chapter
    with no Chinese counterpart (same book id and chapter id) gets an empty
    ``text_zh``. Existing chapter rows with the same key are replaced.

    Args:
        en_units: Parsed English chapters.
        zh_units: Parsed Chinese chapters.
        conn: Open connection to a database that has the ``books`` and
            ``chapters`` tables.

    Raises:
        sqlite3.Error: If an insert fails; the transaction is rolled back.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so books
    # are inserted in a deterministic order.
    book_ids = dict.fromkeys(unit.book_id for unit in en_units)

    zh_dict = {(unit.book_id, unit.chapter_id): unit.text for unit in zh_units}

    chapter_rows = [
        (
            unit.book_id,
            unit.chapter_id,
            unit.text,
            zh_dict.get((unit.book_id, unit.chapter_id), ""),
        )
        for unit in en_units
    ]

    # The connection context manager commits on success and rolls back if
    # any statement raises, so a failed import leaves no partial data.
    with conn:
        conn.executemany(
            "insert or ignore into books (book_id) values (?)",
            [(book_id,) for book_id in book_ids],
        )
        conn.executemany(
            """
            insert or replace into chapters (book_id, chapter_id, text_en, text_zh)
            values (?, ?, ?, ?)
            """,
            chapter_rows,
        )


def main():
    """Parse train.en / train.zh, load them into the database, and print counts."""
    en_units = parse_file("train.en")
    zh_units = parse_file("train.zh")

    # create and populate database
    conn = create_database()
    try:
        import_texts(en_units, zh_units, conn)

        # stats
        c = conn.cursor()
        c.execute("select count(*) from books")
        book_count = c.fetchone()[0]
        c.execute("select count(*) from chapters")
        chapter_count = c.fetchone()[0]
        print(f"Imported {book_count} books and {chapter_count} chapters.")
    finally:
        # Close the connection even if the import or the stats queries fail.
        conn.close()


if __name__ == "__main__":
    main()