first commit

2025-02-09 03:07:07 +06:00
commit d060cdba14
6 changed files with 737 additions and 0 deletions


@@ -0,0 +1,198 @@
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup


def clean_text(text: str) -> str:
    """Normalize whitespace and line endings."""
    return re.sub(r"\s+", " ", text).strip()


def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from a Chinese-language chapter's HTML."""
    # Drop the chapter heading so it does not leak into the first paragraph.
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # Paragraphs are delimited by doubled <br/> tags, so turn each <br/> into
    # a newline and split on blank lines.
    for br in soup.find_all("br"):
        br.replace_with("\n")
    content = soup.body.get_text()
    paragraphs = [clean_text(p) for p in content.split("\n\n") if clean_text(p)]
    return paragraphs
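

# A minimal sketch of the input shape this function assumes (the sample HTML
# below is illustrative, not taken from the actual corpus): paragraphs are
# separated by doubled <br/> tags rather than <p> elements.
#
#   html = "<body><h1>Ch 1</h1>First para<br/><br/>Second para</body>"
#   extract_zh_paragraphs(BeautifulSoup(html, "html.parser"))
#   # -> ["First para", "Second para"]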


def extract_en_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from an English-language chapter's HTML."""
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()
    # Strip inline footnote markers so the paragraph text lines up with the
    # Chinese side.
    for footnote in soup.find_all("span", id=re.compile(r"easy-footnote.*")):
        footnote.decompose()
    paragraphs = [
        clean_text(p.get_text()) for p in soup.find_all("p") if clean_text(p.get_text())
    ]
    return paragraphs
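

# Likewise, a sketch of the assumed English input: standard <p> paragraphs
# with inline footnote markers in <span id="easy-footnote-..."> elements
# (the sample below is illustrative):
#
#   html = '<body><h1>Ch 1</h1><p>Text<span id="easy-footnote-1">[1]</span></p></body>'
#   extract_en_paragraphs(BeautifulSoup(html, "html.parser"))
#   # -> ["Text"]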


def print_debug_info(
    chapter_num: str,
    en_file: Path,
    zh_file: Path,
    en_paragraphs: List[str],
    zh_paragraphs: List[str],
):
    """Dump both sides of a chapter whose paragraph counts disagree."""
    print(f"\n=== MISMATCH DETECTED IN CHAPTER {chapter_num} ===")
    print(f"English file: {en_file}")
    print(f"Chinese file: {zh_file}")
    print("\nParagraph count:")
    print(f"  English: {len(en_paragraphs)}")
    print(f"  Chinese: {len(zh_paragraphs)}")
    print("\nFirst 3 English paragraphs:")
    for i, p in enumerate(en_paragraphs[:3]):
        print(f"  {i + 1}: {p[:100]}...")
    print("\nFirst 3 Chinese paragraphs:")
    for i, p in enumerate(zh_paragraphs[:3]):
        print(f"  {i + 1}: {p[:100]}...")
    print("\nRaw Chinese HTML:")
    with open(zh_file, "r", encoding="utf-8") as f:
        content = f.read()
    print(content[:500])


def process_chapter_pair(en_path: Path, zh_path: Path) -> Tuple[List[str], List[str]]:
    """Process a pair of corresponding chapter files into paragraph lists."""
    with open(en_path, "r", encoding="utf-8") as f:
        en_soup = BeautifulSoup(f, "html.parser")
    en_paragraphs = extract_en_paragraphs(en_soup)
    with open(zh_path, "r", encoding="utf-8") as f:
        zh_soup = BeautifulSoup(f, "html.parser")
    zh_paragraphs = extract_zh_paragraphs(zh_soup)
    return en_paragraphs, zh_paragraphs


def insert_book_chapters(
    db_path: str, book_id: str, matched_chapters: Dict[str, Tuple[List[str], List[str]]]
):
    """
    Insert chapters and paragraphs into the database for a given book_id.
    Only inserts when the English and Chinese paragraph counts match.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    try:
        cur.execute("insert or ignore into books (book_id) values (?)", (book_id,))
        for chapter_id, (en_paragraphs, zh_paragraphs) in matched_chapters.items():
            # Only process chapters whose paragraph counts line up 1:1.
            if len(en_paragraphs) != len(zh_paragraphs):
                print(
                    f"Skipping chapter {chapter_id} due to paragraph count mismatch: "
                    f"EN: {len(en_paragraphs)}, ZH: {len(zh_paragraphs)}"
                )
                continue
            # Join paragraphs to form the full chapter text.
            chapter_text_en = "\n".join(en_paragraphs)
            chapter_text_zh = "\n".join(zh_paragraphs)
            cur.execute(
                """
                insert into chapters (book_id, chapter_id, text_en, text_zh)
                values (?, ?, ?, ?)
                on conflict (book_id, chapter_id) do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh
                """,
                (book_id, chapter_id, chapter_text_en, chapter_text_zh),
            )
            # Insert the aligned paragraph pairs.
            for en_text, zh_text in zip(en_paragraphs, zh_paragraphs):
                char_count = len(en_text)
                cur.execute(
                    """
                    insert into paragraphs
                        (book_id, chapter_id, text_en, text_zh, char_count)
                    values (?, ?, ?, ?, ?)
                    """,
                    (book_id, chapter_id, en_text, zh_text, char_count),
                )
            print(
                f"Processed chapter {chapter_id} with {len(en_paragraphs)} paragraphs"
            )
        conn.commit()
        print(f"Successfully processed all matching chapters for book {book_id}")
    except Exception as e:
        conn.rollback()
        print(f"Error processing chapters: {e}")
        raise
    finally:
        conn.close()
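

# The schema itself is not part of this commit; a minimal sketch that would
# satisfy the statements above (column types and the unique constraint on
# (book_id, chapter_id), which the upsert's conflict target requires, are
# assumptions):
#
#   create table if not exists books (book_id text primary key);
#   create table if not exists chapters (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       unique (book_id, chapter_id)
#   );
#   -- paragraphs carries no unique key here, matching the plain insert above,
#   -- so re-running the import would append duplicate rows.
#   create table if not exists paragraphs (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       char_count integer
#   );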


def match_chapters(epub_dir: str) -> Dict[str, Tuple[List[str], List[str]]]:
    """Match and process chapters between the English and Chinese directories."""
    base_dir = Path(epub_dir)
    en_dir = base_dir / "en"
    zh_dir = base_dir / "zh"
    matched_paragraphs = {}
    # Walk the English chapter files in order.
    en_files = sorted(en_dir.glob("*.xhtml"))
    for en_file in en_files:
        # Derive the Chinese filename from the four-digit chapter number:
        # a file containing "0001" pairs with "0001_.xhtml".
        match = re.search(r"(\d{4})", en_file.name)
        if not match:
            print(f"Warning: no chapter number in {en_file.name}")
            continue
        chapter_num = match.group(1)
        zh_file = zh_dir / f"{chapter_num}_.xhtml"
        if not zh_file.exists():
            print(f"Warning: no matching Chinese file for {en_file.name}")
            continue
        try:
            en_paragraphs, zh_paragraphs = process_chapter_pair(en_file, zh_file)
            # Dump debug output on a large paragraph-count mismatch if needed:
            # if abs(len(en_paragraphs) - len(zh_paragraphs)) > 5:
            #     print_debug_info(
            #         chapter_num, en_file, zh_file, en_paragraphs, zh_paragraphs
            #     )
            print(f"Chapter {chapter_num}:")
            print(f"  English paragraphs: {len(en_paragraphs)}")
            print(f"  Chinese paragraphs: {len(zh_paragraphs)}")
            matched_paragraphs[chapter_num] = (en_paragraphs, zh_paragraphs)
        except Exception as e:
            print(f"Error processing chapter {chapter_num}: {e}")
    return matched_paragraphs


def main():
    epub_dir = "epubs/1v1h"
    matched_chapters = match_chapters(epub_dir)
    insert_book_chapters("parallel_texts.db", "1v1h", matched_chapters)


if __name__ == "__main__":
    main()
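

# A quick sanity check after a run (a sketch, assuming the schema above):
#
#   import sqlite3
#   conn = sqlite3.connect("parallel_texts.db")
#   n, = conn.execute(
#       "select count(*) from paragraphs where book_id = ?", ("1v1h",)
#   ).fetchone()
#   print(f"imported {n} aligned paragraph pairs")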