from bs4 import BeautifulSoup
import re
import sqlite3
from pathlib import Path
from typing import List, Tuple, Dict


def clean_text(text: str) -> str:
    """Collapse runs of whitespace (including line endings) into single spaces."""
    return re.sub(r"\s+", " ", text).strip()


def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from a Chinese-language chapter's HTML."""
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # Each <br> becomes "\n", so a double <br> yields the "\n\n"
    # paragraph separator used for the split below.
    for br in soup.find_all("br"):
        br.replace_with("\n")
    body = soup.body or soup  # fall back to the whole document if <body> is absent
    content = body.get_text()
    paragraphs = [clean_text(p) for p in content.split("\n\n") if clean_text(p)]
    return paragraphs


def extract_en_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from an English-language chapter's HTML."""
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # Strip inline footnote markers before collecting paragraph text.
    for footnote in soup.find_all("span", id=re.compile(r"easy-footnote.*")):
        footnote.decompose()
    paragraphs = [
        clean_text(p.get_text())
        for p in soup.find_all("p")
        if clean_text(p.get_text())
    ]
    return paragraphs


def print_debug_info(
    chapter_num: str,
    en_file: Path,
    zh_file: Path,
    en_paragraphs: List[str],
    zh_paragraphs: List[str],
):
    """Print diagnostics for a chapter pair whose paragraph counts diverge."""
    print(f"\n=== MISMATCH DETECTED IN CHAPTER {chapter_num} ===")
    print(f"English file: {en_file}")
    print(f"Chinese file: {zh_file}")
    print("\nParagraph count:")
    print(f"  English: {len(en_paragraphs)}")
    print(f"  Chinese: {len(zh_paragraphs)}")
    print("\nFirst 3 English paragraphs:")
    for i, p in enumerate(en_paragraphs[:3]):
        print(f"  {i + 1}: {p[:100]}...")
    print("\nFirst 3 Chinese paragraphs:")
    for i, p in enumerate(zh_paragraphs[:3]):
        print(f"  {i + 1}: {p[:100]}...")
    print("\nRaw Chinese HTML:")
    with open(zh_file, "r", encoding="utf-8") as f:
        print(f.read()[:500])


def process_chapter_pair(en_path: Path, zh_path: Path) -> Tuple[List[str], List[str]]:
    """Parse a pair of corresponding chapter files into paragraph lists."""
    with open(en_path, "r", encoding="utf-8") as f:
        en_soup = BeautifulSoup(f, "html.parser")
    en_paragraphs = extract_en_paragraphs(en_soup)
    with open(zh_path, "r", encoding="utf-8") as f:
        zh_soup = BeautifulSoup(f, "html.parser")
    zh_paragraphs = extract_zh_paragraphs(zh_soup)
    return en_paragraphs, zh_paragraphs


def insert_book_chapters(
    db_path: str,
    book_id: str,
    matched_chapters: Dict[str, Tuple[List[str], List[str]]],
):
    """
    Insert chapters and paragraphs into the database for a given book_id.
    Only inserts when English and Chinese paragraph counts match.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    try:
        cur.execute("insert or ignore into books (book_id) values (?)", (book_id,))
        for chapter_id, (en_paragraphs, zh_paragraphs) in matched_chapters.items():
            # Only process chapters whose paragraph counts match.
            if len(en_paragraphs) != len(zh_paragraphs):
                print(
                    f"Skipping chapter {chapter_id} due to paragraph count mismatch: "
                    f"EN: {len(en_paragraphs)}, ZH: {len(zh_paragraphs)}"
                )
                continue
            # Join paragraphs to form the full chapter text.
            chapter_text_en = "\n".join(en_paragraphs)
            chapter_text_zh = "\n".join(zh_paragraphs)
            cur.execute(
                """
                insert into chapters (book_id, chapter_id, text_en, text_zh)
                values (?, ?, ?, ?)
                on conflict (book_id, chapter_id) do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh
                """,
                (book_id, chapter_id, chapter_text_en, chapter_text_zh),
            )
            # The chapter row upserts on re-run; clear any old paragraph rows
            # so the plain inserts below stay idempotent as well.
            cur.execute(
                "delete from paragraphs where book_id = ? and chapter_id = ?",
                (book_id, chapter_id),
            )
            # Insert the aligned paragraph pairs.
            for en_text, zh_text in zip(en_paragraphs, zh_paragraphs):
                char_count = len(en_text)  # length of the English side
                cur.execute(
                    """
                    insert into paragraphs (book_id, chapter_id, text_en, text_zh, char_count)
                    values (?, ?, ?, ?, ?)
                    """,
                    (book_id, chapter_id, en_text, zh_text, char_count),
                )
            print(
                f"Processed chapter {chapter_id} with {len(en_paragraphs)} paragraphs"
            )
        conn.commit()
        print(f"Successfully processed all matching chapters for book {book_id}")
    except Exception as e:
        conn.rollback()
        print(f"Error processing chapters: {e}")
        raise
    finally:
        conn.close()


def match_chapters(epub_dir: str) -> Dict[str, Tuple[List[str], List[str]]]:
    """Match and process chapters between English and Chinese directories."""
    base_dir = Path(epub_dir)
    en_dir = base_dir / "en"
    zh_dir = base_dir / "zh"
    matched_paragraphs: Dict[str, Tuple[List[str], List[str]]] = {}
    # Process English files in sorted (chapter) order.
    en_files = sorted(en_dir.glob("*.xhtml"))
    for en_file in en_files:
        # Derive the corresponding Chinese filename from the 4-digit chapter number.
        match = re.search(r"(\d{4})", en_file.name)
        if not match:
            print(f"Warning: No chapter number found in {en_file.name}")
            continue
        chapter_num = match.group(1)
        zh_file = zh_dir / f"{chapter_num}_.xhtml"
        if not zh_file.exists():
            print(f"Warning: No matching Chinese file for {en_file.name}")
            continue
        try:
            en_paragraphs, zh_paragraphs = process_chapter_pair(en_file, zh_file)
            # Uncomment to dump diagnostics for significant count mismatches:
            # if abs(len(en_paragraphs) - len(zh_paragraphs)) > 5:
            #     print_debug_info(
            #         chapter_num, en_file, zh_file, en_paragraphs, zh_paragraphs
            #     )
            print(f"Chapter {chapter_num}:")
            print(f"  English paragraphs: {len(en_paragraphs)}")
            print(f"  Chinese paragraphs: {len(zh_paragraphs)}")
            matched_paragraphs[chapter_num] = (en_paragraphs, zh_paragraphs)
        except Exception as e:
            print(f"Error processing chapter {chapter_num}: {e}")
    return matched_paragraphs


def main():
    epub_dir = "epubs/1v1h"
    matched_chapters = match_chapters(epub_dir)
    insert_book_chapters("parallel_texts.db", "1v1h", matched_chapters)


if __name__ == "__main__":
    main()