commit d060cdba14a8e336f198a1658a73fc89f9851c21 Author: kuwoyuki Date: Sun Feb 9 03:07:07 2025 +0600 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..38f7f4b --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +epubs/ +*.db +train.en +train.zh diff --git a/README.md b/README.md new file mode 100644 index 0000000..eb88220 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# chinese -> english finetuning datasets + +train.en and train.zh are from [here](https://www.dropbox.com/scl/fo/dtrf3pe1vfbo5nse16648/ANLqlv3ascANpkdnYF_w4Jk/V1/TRAIN?dl=0&rlkey=486vbn17qra1ez91btj0n4xu2&subfolder_nav_tracking=1) +TODO: mirror diff --git a/paragraph_split.py b/paragraph_split.py new file mode 100644 index 0000000..be21362 --- /dev/null +++ b/paragraph_split.py @@ -0,0 +1,389 @@ +import sqlite3 +import os +import re +from bs4 import BeautifulSoup +from pathlib import Path +import unicodedata + + +def create_paragraphs_table(conn): + """Create the paragraphs table with necessary columns and constraints.""" + conn.execute( + """ + create table if not exists paragraphs ( + id integer primary key autoincrement, + book_id text not null, + chapter_id text not null, + text_en text, + text_zh text, + char_count integer, + foreign key (book_id, chapter_id) references chapters(book_id, chapter_id) + ) + """ + ) + + +def normalize_quotes(text): + # normalize unicode characters to their composed form + text = unicodedata.normalize("NFKC", text) + + quote_map = { + "\u201c": '"', # LEFT DOUBLE QUOTATION MARK + "\u201d": '"', # RIGHT DOUBLE QUOTATION MARK + "\u2018": "'", # LEFT SINGLE QUOTATION MARK + "\u2019": "'", # RIGHT SINGLE QUOTATION MARK + "\u00ab": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + "\u00bb": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + "\u2039": "'", # SINGLE LEFT-POINTING ANGLE QUOTATION MARK + "\u203a": "'", # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + "\u2032": "'", # PRIME + "\u2033": '"', # DOUBLE PRIME + } + + for old, new in quote_map.items(): + text = text.replace(old, new) + + return text + + +def strip_paragraph_markers(text): + """remove p markers like #<# and #>#""" + return re.sub(r"#<#|#>#", "", text).strip() + + +def normalize_text(text): + """text normalizations""" + text = normalize_quotes(text) + text = strip_paragraph_markers(text) + return text + + +def get_paragraphs_from_xhtml(xhtml_path): + """extract p from html""" + with open(xhtml_path, "r", encoding="utf-8") as f: + content = f.read() + + soup = BeautifulSoup(content, "html.parser") + paragraphs = [] + + p_elements = soup.find_all("p") + + for p in p_elements: + text = normalize_text(p.get_text()) + if text: # only add non-empty paragraphs + paragraphs.append(text) + + # no <p> paragraphs found, fall back to splitting on <br> tags + if not paragraphs: + content = re.sub(r"<br\s*/?>", "<br/>", content, flags=re.IGNORECASE) + parts = re.split(r"<br/>\s*<br/>
", content, flags=re.IGNORECASE) + + for part in parts: + clean_text = BeautifulSoup(part, "html.parser").get_text() + text = normalize_text(clean_text) + if text: # only add non-empty paragraphs + paragraphs.append(text) + + return paragraphs + + +def get_zh_text_for_lines(zh_lines, start_idx, end_idx): + """Get corresponding Chinese text for given line range.""" + return " ".join(zh_lines[start_idx : end_idx + 1]) + + +def extract_paragraphs(text_en, text_zh): + """ + Extract matching paragraphs from English and Chinese texts. + Returns list of paragraphs with normalized text. + """ + paragraphs = [] + current_en_lines = [] + current_en_indices = [] + + # split into lines and normalize + en_lines = [line.strip() for line in text_en.split("\n")] + zh_lines = [line.strip() for line in text_zh.split("\n")] if text_zh else [] + + i = 0 + while i < len(en_lines): + line = en_lines[i] + normalized_line = normalize_text(line) + + if not normalized_line: + i += 1 + continue + + current_en_lines.append(normalized_line) + current_en_indices.append(i) + + # Look ahead to check if next line is empty or ends the paragraph + next_idx = i + 1 + while next_idx < len(en_lines): + next_line = en_lines[next_idx].strip() + if not next_line: + # Empty line - continue current paragraph + next_idx += 1 + continue + + # If we have Chinese text, check if these lines correspond to a complete thought + if zh_lines: + zh_text = get_zh_text_for_lines( + zh_lines, current_en_indices[0], next_idx - 1 + ) + if zh_text: + # Found corresponding Chinese text - end paragraph + break + + # Add next line to current paragraph + normalized_next = normalize_text(next_line) + current_en_lines.append(normalized_next) + current_en_indices.append(next_idx) + next_idx += 1 + + # Create paragraph + if current_en_lines: + en_text = " ".join(current_en_lines) + zh_text = "" + if zh_lines: + zh_text = get_zh_text_for_lines( + zh_lines, current_en_indices[0], current_en_indices[-1] + ) + + paragraphs.append( + {"text_en": en_text, "text_zh": zh_text, "char_count": len(en_text)} + ) + + # Reset for next paragraph + current_en_lines = [] + current_en_indices = [] + i = next_idx + + return paragraphs + + +def match_paragraphs(xhtml_paragraphs, db_lines, lines_to_try=3): + """ + Match paragraphs from XHTML with lines from database. + Tries first few lines at start before giving up, to handle chapter titles and initial dialog. 
+ + Args: + xhtml_paragraphs: List of XHTML paragraph texts + db_lines: List of database text lines + lines_to_try: Number of initial lines to try before giving up + + Returns: + List of tuples containing (start_idx, end_idx) for matched paragraphs + """ + + def find_next_content_line(current_idx): + """Find next non-empty line and return its index and content.""" + while current_idx < len(db_lines): + line = normalize_text(db_lines[current_idx].strip()) + if line: + return current_idx, line + current_idx += 1 + return current_idx, None + + matched_indices = [] + xhtml_idx = 0 + db_idx = 0 + tried_lines = 0 + + while xhtml_idx < len(xhtml_paragraphs) and db_idx < len(db_lines): + # find next non-empty line in db + db_check_idx, db_line = find_next_content_line(db_idx) + if not db_line: + break + + # search for p containing this line + while ( + xhtml_idx < len(xhtml_paragraphs) + and db_line not in xhtml_paragraphs[xhtml_idx] + ): + xhtml_idx += 1 + + # try ~3 db_lines at start + if xhtml_idx >= len(xhtml_paragraphs): + if not matched_indices and tried_lines < lines_to_try: + tried_lines += 1 + xhtml_idx = 0 + db_idx = db_check_idx + 1 + continue + break + + # collect all database lines that belong to this p + start_idx = db_check_idx + current_idx = db_check_idx + + while current_idx < len(db_lines): + current_line = normalize_text(db_lines[current_idx].strip()) + if current_line and current_line not in xhtml_paragraphs[xhtml_idx]: + break + current_idx += 1 + + matched_indices.append((start_idx, current_idx - 1)) + db_idx = current_idx + xhtml_idx += 1 + + return matched_indices + + +def normalize_chapter_id(chapter_id): + """ + Normalize chapter IDs by removing padding and handling special cases. + Examples: + - gfyxjdcz!_0001 -> 1 + - 00001-1-Swindler -> 1> + - wyctUp_0001 -> 1 + - ltzz_0002 -> 2 + """ + # handle IDs with _ + if "_" in chapter_id: + chapter_id = chapter_id.split("_")[-1] + + # rm any non-digit prefix and suffix + digits = re.search(r"(\d+)", chapter_id) + if digits: + chapter_id = digits.group(1) + + # rm leading zeros + return str(int(chapter_id)) + + +def find_chapter_file(epub_dir, normalized_id): + epub_dir = Path(epub_dir) + search_dirs = [ + epub_dir / "OEBPS" / "Text", + epub_dir / "OEBPS", + ] + + for directory in search_dirs: + if not directory.exists(): + continue + + for file_path in directory.glob("*.*html"): + numbers = re.findall(r"\d+", file_path.stem) + if numbers: + file_chapter_num = str(int(numbers[0])) + if file_chapter_num == normalized_id: + return file_path + + return None + + +def preserve_lines(text): + return [line.strip() if line.strip() else line for line in text.split("\n")] + + +def print_matched_paragraphs(text_en_lines, text_zh_lines, matched_indices): + """ + Print matched paragraphs from English and Chinese text, with Chinese translation + immediately following each English paragraph. 
+ + Args: + text_en_lines (list): List of English text lines + text_zh_lines (list): List of Chinese text lines + matched_indices (list): List of tuples containing (start_idx, end_idx) + """ + if not matched_indices: + print("No matched paragraphs found.") + return + + for start_idx, end_idx in matched_indices: + # Get and join English lines for this range + en_para = " ".join(text_en_lines[start_idx : end_idx + 1]) + # Get and join Chinese lines for the same range + zh_para = " ".join(text_zh_lines[start_idx : end_idx + 1]) + + # Print English followed by Chinese + print(strip_paragraph_markers(en_para)) + print(strip_paragraph_markers(zh_para)) + print() # Extra newline between pairs + + +def process_book(conn, epub_base_dir, book_id): + """Process an entire book and add paragraphs to database.""" + epub_dir = Path(epub_base_dir) / book_id + + if not epub_dir.exists(): + # print(f"Warning: EPUB directory not found for book {book_id}: {epub_dir}") + return + + print(f"Processing book {book_id} from: {epub_dir}") + + # Get all chapters for this book + chapters = conn.execute( + "select chapter_id, text_en, text_zh from chapters where book_id = ?", + (book_id,), + ).fetchall() + + print(f"Chapter count: {len(chapters)}") + + for chapter_id, text_en, text_zh in chapters: + if not text_en or not text_zh: + print( + f"Warning: Missing content for chapter {chapter_id} in book {book_id}" + ) + continue + + # find html file + normalized_id = normalize_chapter_id(chapter_id) + xhtml_path = find_chapter_file(epub_dir, normalized_id) + if not xhtml_path: + print( + f"Warning: Could not find XHTML file for chapter {chapter_id}. normalized_id: {normalized_id}, xhtml_path: {xhtml_path}" + ) + continue + + # extract p from html + xhtml_paragraphs = get_paragraphs_from_xhtml(xhtml_path) + + # split by \n only, strip only non-empty lines + text_en_lines = preserve_lines(text_en) + text_zh_lines = preserve_lines(text_zh) + + # match ps between XHTML and db content + matched_indices = match_paragraphs(xhtml_paragraphs, text_en_lines) + + # print_matched_paragraphs(text_en_lines, text_zh_lines, matched_indices) + matched_pairs = [] + for start_idx, end_idx in matched_indices: + en_para = strip_paragraph_markers( + " ".join(text_en_lines[start_idx : end_idx + 1]) + ) + zh_para = strip_paragraph_markers( + " ".join(text_zh_lines[start_idx : end_idx + 1]) + ) + matched_pairs.append((en_para, zh_para)) + + for en_para, zh_para in matched_pairs: + conn.execute( + """ + INSERT INTO paragraphs (book_id, chapter_id, text_en, text_zh, char_count) + VALUES (?, ?, ?, ?, ?) 
+ """, + (book_id, chapter_id, en_para, zh_para, len(en_para)), + ) + + conn.commit() + + +def process_all_books(db_path, epub_base_dir): + """Process all books in the database.""" + conn = sqlite3.connect(db_path) + create_paragraphs_table(conn) + + books = conn.execute("select book_id from books").fetchall() + + for (book_id,) in books: + process_book(conn, epub_base_dir, book_id) + + conn.close() + + +if __name__ == "__main__": + db_path = "parallel_texts.db" + epub_base_dir = "epubs" # base dir + + process_all_books(db_path, epub_base_dir) diff --git a/paragraph_split_custom_zh.py b/paragraph_split_custom_zh.py new file mode 100644 index 0000000..d22d3fa --- /dev/null +++ b/paragraph_split_custom_zh.py @@ -0,0 +1,198 @@ +import os +from bs4 import BeautifulSoup +import re +import sqlite3 +from pathlib import Path +from typing import List, Tuple, Dict + + +def clean_text(text: str) -> str: + """normalize whitespace and line end""" + return re.sub(r"\s+", " ", text).strip() + + +def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]: + """extract paragraphs from zh HTML""" + if h1_tag := soup.find("h1"): + h1_tag.decompose() + + for br in soup.find_all("br"): + br.replace_with("\n") + + content = soup.body.get_text() + paragraphs = [clean_text(p) for p in content.split("\n\n") if clean_text(p)] + + return paragraphs + + +def extract_en_paragraphs(soup: BeautifulSoup) -> List[str]: + """etract paragraphs from English HTML""" + if h1_tag := soup.find("h1"): + h1_tag.decompose() + + for footnote in soup.find_all("span", id=re.compile(r"easy-footnote.*")): + footnote.decompose() + + paragraphs = [ + clean_text(p.get_text()) for p in soup.find_all("p") if clean_text(p.get_text()) + ] + + return paragraphs + + +def print_debug_info( + chapter_num: str, + en_file: Path, + zh_file: Path, + en_paragraphs: List[str], + zh_paragraphs: List[str], +): + """debug""" + print(f"\n=== MISMATCH DETECTED IN CHAPTER {chapter_num} ===") + print(f"English file: {en_file}") + print(f"Chinese file: {zh_file}") + print(f"\nParagraph count:") + print(f" English: {len(en_paragraphs)}") + print(f" Chinese: {len(zh_paragraphs)}") + + print("\nFirst 3 English p:") + for i, p in enumerate(en_paragraphs[:3]): + print(f" {i+1}: {p[:100]}...") + + print("\nFirst 3 Chinese p:") + for i, p in enumerate(zh_paragraphs[:3]): + print(f" {i+1}: {p[:100]}...") + + print("\nRaw Chinese HTML:") + with open(zh_file, "r", encoding="utf-8") as f: + content = f.read() + print(content[:500]) + + +def process_chapter_pair(en_path: Path, zh_path: Path) -> Tuple[List[str], List[str]]: + """Process a pair of corresponding chapter files""" + with open(en_path, "r", encoding="utf-8") as f: + en_soup = BeautifulSoup(f, "html.parser") + en_paragraphs = extract_en_paragraphs(en_soup) + + with open(zh_path, "r", encoding="utf-8") as f: + zh_soup = BeautifulSoup(f, "html.parser") + zh_paragraphs = extract_zh_paragraphs(zh_soup) + + return en_paragraphs, zh_paragraphs + + +def insert_book_chapters( + db_path: str, book_id: str, matched_chapters: Dict[str, Tuple[List[str], List[str]]] +): + """ + Insert chapters and paragraphs into the database for a given book_id. + Only inserts when English and Chinese paragraph counts match. 
+ """ + conn = sqlite3.connect(db_path) + cur = conn.cursor() + + try: + cur.execute("insert or ignore into books (book_id) values (?)", (book_id,)) + + for chapter_id, (en_paragraphs, zh_paragraphs) in matched_chapters.items(): + # only process if paragraph counts match + if len(en_paragraphs) != len(zh_paragraphs): + print( + f"Skipping chapter {chapter_id} due to paragraph count mismatch: " + f"EN: {len(en_paragraphs)}, ZH: {len(zh_paragraphs)}" + ) + continue + + # join paragraphs for chapter text + chapter_text_en = "\n".join(en_paragraphs) + chapter_text_zh = "\n".join(zh_paragraphs) + + cur.execute( + """ + insert into chapters (book_id, chapter_id, text_en, text_zh) + values (?, ?, ?, ?) + on conflict (book_id, chapter_id) do update set + text_en = excluded.text_en, + text_zh = excluded.text_zh + """, + (book_id, chapter_id, chapter_text_en, chapter_text_zh), + ) + + # insert p + for en_text, zh_text in zip(en_paragraphs, zh_paragraphs): + char_count = len(en_text) + cur.execute( + """ + insert into paragraphs + (book_id, chapter_id, text_en, text_zh, char_count) + values (?, ?, ?, ?, ?) + """, + (book_id, chapter_id, en_text, zh_text, char_count), + ) + + print( + f"Processed chapter {chapter_id} with {len(en_paragraphs)} paragraphs" + ) + + conn.commit() + print(f"Successfully processed all matching chapters for book {book_id}") + + except Exception as e: + conn.rollback() + print(f"Error processing chapters: {str(e)}") + raise + finally: + conn.close() + + +def match_chapters(epub_dir: str) -> Dict[str, Tuple[List[str], List[str]]]: + """Match and process chapters between English and Chinese directories.""" + base_dir = Path(epub_dir) + en_dir = base_dir / "en" + zh_dir = base_dir / "zh" + + matched_paragraphs = {} + + # Get all English files and sort them + en_files = sorted([f for f in en_dir.glob("*.xhtml")]) + + for en_file in en_files: + # Construct corresponding Chinese filename + chapter_num = re.search(r"(\d{4})", en_file.name).group(1) + zh_file = zh_dir / f"{chapter_num}_.xhtml" + + if not zh_file.exists(): + print(f"Warning: No matching Chinese file for {en_file.name}") + continue + + try: + en_paragraphs, zh_paragraphs = process_chapter_pair(en_file, zh_file) + + # Check for significant mismatch in paragraph counts + # if abs(len(en_paragraphs) - len(zh_paragraphs)) > 5: + # print_debug_info( + # chapter_num, en_file, zh_file, en_paragraphs, zh_paragraphs + # ) + # else: + print(f"Chapter {chapter_num}:") + print(f" English paragraphs: {len(en_paragraphs)}") + print(f" Chinese paragraphs: {len(zh_paragraphs)}") + + # Store results + matched_paragraphs[chapter_num] = (en_paragraphs, zh_paragraphs) + + except Exception as e: + print(f"Error processing chapter {chapter_num}: {str(e)}") + + return matched_paragraphs + + +def main(): + epub_dir = "epubs/1v1h" + matched_chapters = match_chapters(epub_dir) + insert_book_chapters("parallel_texts.db", "1v1h", matched_chapters) + + +if __name__ == "__main__": + main() diff --git a/parallel_text_import.py b/parallel_text_import.py new file mode 100644 index 0000000..e7a633e --- /dev/null +++ b/parallel_text_import.py @@ -0,0 +1,119 @@ +import sqlite3 +import re +from typing import List, Tuple, Dict +from dataclasses import dataclass + + +@dataclass +class TextUnit: + book_id: str + chapter_id: str + text: str + + +def parse_file(filename: str) -> List[TextUnit]: + """Parse the file and return a list of TextUnits.""" + units = [] + current_book = "" + current_chapter = "" + current_text = [] + book_pattern = re.compile(r'') + 
chapter_pattern = re.compile(r'<CHAPTER id="(.+?)">') + end_pattern = re.compile(r"</(BOOK|CHAPTER)>") + + with open(filename, "r", encoding="utf-8") as f: + for line in f: + # parse BOOK opening tag + book_match = book_pattern.match(line) + if book_match: + current_book = book_match.group(1) + continue + + # parse CHAPTER opening tag + chapter_match = chapter_pattern.match(line) + if chapter_match: + current_chapter = chapter_match.group(1) + current_text = [] + continue + + # on any end tag, save the current chapter + if end_pattern.match(line): + if current_text: + units.append( + TextUnit( + book_id=current_book, + chapter_id=current_chapter, + text="".join(current_text), + ) + ) + continue + + # if line doesn't match any of our known tags, it's content + if not book_pattern.match(line) and not chapter_pattern.match(line): + current_text.append(line) + + return units + + +def create_database(db_name: str = "parallel_texts.db"): + """create schema""" + conn = sqlite3.connect(db_name) + + with open("schema.sql", "r") as f: + conn.executescript(f.read()) + + conn.commit() + return conn + + +def import_texts( + en_units: List[TextUnit], zh_units: List[TextUnit], conn: sqlite3.Connection +): + """import parsed text""" + c = conn.cursor() + + # collect all unique book IDs + book_ids = set(unit.book_id for unit in en_units) + + # insert books + for book_id in book_ids: + c.execute("insert or ignore into books (book_id) values (?)", (book_id,)) + + # create a dict for Chinese texts + zh_dict = {(unit.book_id, unit.chapter_id): unit.text for unit in zh_units} + + # insert chapters with parallel texts + for en_unit in en_units: + zh_text = zh_dict.get((en_unit.book_id, en_unit.chapter_id), "") + c.execute( + """ + insert or replace into chapters (book_id, chapter_id, text_en, text_zh) values (?, ?, ?, ?) + """, + (en_unit.book_id, en_unit.chapter_id, en_unit.text, zh_text), + ) + + conn.commit() + + +def main(): + en_units = parse_file("train.en") + zh_units = parse_file("train.zh") + + # create and populate database + conn = create_database() + import_texts(en_units, zh_units, conn) + + # stats + c = conn.cursor() + c.execute("select count(*) from books") + book_count = c.fetchone()[0] + c.execute("select count(*) from chapters") + chapter_count = c.fetchone()[0] + + print(f"Imported {book_count} books and {chapter_count} chapters.") + + conn.close() + + +if __name__ == "__main__": + main() diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..1d9aa45 --- /dev/null +++ b/schema.sql @@ -0,0 +1,23 @@ +create table if not exists books ( + book_id text primary key +); + +create table if not exists chapters ( + id integer primary key autoincrement, + book_id text, + chapter_id text, + text_en text, + text_zh text, + foreign key (book_id) references books(book_id), + unique(book_id, chapter_id) +); + +create table if not exists paragraphs ( + id integer primary key autoincrement, + book_id text not null, + chapter_id text not null, + text_en text, + text_zh text, + char_count integer, + foreign key (book_id, chapter_id) references chapters(book_id, chapter_id) +);
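A minimal sketch of how the populated database could be dumped back out as aligned plain-text files for finetuning, not part of the commit above: the database path and the paragraphs table/columns come from schema.sql and the scripts in this commit, while the output filenames and the empty-pair filter are illustrative assumptions.

import sqlite3

# assumption: parallel_texts.db has already been populated by the scripts above
conn = sqlite3.connect("parallel_texts.db")
rows = conn.execute(
    "select text_en, text_zh from paragraphs where text_en != '' and text_zh != ''"
).fetchall()

# hypothetical output names; the two files stay aligned line-by-line
with open("paragraphs.en", "w", encoding="utf-8") as f_en, open(
    "paragraphs.zh", "w", encoding="utf-8"
) as f_zh:
    for text_en, text_zh in rows:
        f_en.write(text_en.replace("\n", " ") + "\n")
        f_zh.write(text_zh.replace("\n", " ") + "\n")

conn.close()
print(f"Exported {len(rows)} paragraph pairs.")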