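"""Build a parallel English/Chinese text corpus from extracted EPUB chapters.

Reads paired chapter files from <epub_dir>/en and <epub_dir>/zh, aligns
their paragraphs, and stores chapters whose paragraph counts match in a
SQLite database of parallel texts.
"""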
import re
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup


def clean_text(text: str) -> str:
    """Collapse all whitespace runs, including line endings, into single spaces."""
    return re.sub(r"\s+", " ", text).strip()


def extract_zh_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from Chinese HTML.

    Most Chinese raws separate paragraphs with a pair of <br> tags rather
    than with <p> elements, so convert <br> to newlines and split on
    blank lines.
    """
    # Drop the chapter heading so it is not treated as a paragraph.
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()

    for br in soup.find_all("br"):
        br.replace_with("\n")

    # html.parser does not synthesize a <body> for bare fragments, so fall
    # back to the whole soup if one is missing.
    content = (soup.body or soup).get_text()
    paragraphs = [clean_text(p) for p in content.split("\n\n") if clean_text(p)]

    return paragraphs
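# Illustrative input/output for extract_zh_paragraphs (hypothetical markup):
#   "<body><h1>第一章</h1>第一段<br/><br/>第二段</body>" -> ["第一段", "第二段"]

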
def extract_en_paragraphs(soup: BeautifulSoup) -> List[str]:
    """Extract paragraphs from English HTML."""
    # Drop the chapter heading so it is not treated as a paragraph.
    if h1_tag := soup.find("h1"):
        h1_tag.decompose()

    # Remove inline footnote marker spans so they don't pollute paragraph text.
    for footnote in soup.find_all("span", id=re.compile(r"easy-footnote.*")):
        footnote.decompose()

    paragraphs = [
        clean_text(p.get_text()) for p in soup.find_all("p") if clean_text(p.get_text())
    ]

    return paragraphs


def print_debug_info(
    chapter_num: str,
    en_file: Path,
    zh_file: Path,
    en_paragraphs: List[str],
    zh_paragraphs: List[str],
):
    """Print diagnostics for a chapter whose paragraph counts do not match."""
    print(f"\n=== MISMATCH DETECTED IN CHAPTER {chapter_num} ===")
    print(f"English file: {en_file}")
    print(f"Chinese file: {zh_file}")
    print("\nParagraph count:")
    print(f"  English: {len(en_paragraphs)}")
    print(f"  Chinese: {len(zh_paragraphs)}")

    print("\nFirst 3 English paragraphs:")
    for i, p in enumerate(en_paragraphs[:3]):
        print(f"  {i+1}: {p[:100]}...")

    print("\nFirst 3 Chinese paragraphs:")
    for i, p in enumerate(zh_paragraphs[:3]):
        print(f"  {i+1}: {p[:100]}...")

    print("\nRaw Chinese HTML:")
    with open(zh_file, "r", encoding="utf-8") as f:
        content = f.read()
    print(content[:500])


def process_chapter_pair(en_path: Path, zh_path: Path) -> Tuple[List[str], List[str]]:
    """Process a pair of corresponding chapter files."""
    with open(en_path, "r", encoding="utf-8") as f:
        en_soup = BeautifulSoup(f, "html.parser")
    en_paragraphs = extract_en_paragraphs(en_soup)

    with open(zh_path, "r", encoding="utf-8") as f:
        zh_soup = BeautifulSoup(f, "html.parser")
    zh_paragraphs = extract_zh_paragraphs(zh_soup)

    return en_paragraphs, zh_paragraphs


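# The statements below assume a schema along these lines; this is a sketch
# inferred from the queries, not a definition the script creates:
#
#   create table books (book_id text primary key);
#   create table chapters (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       primary key (book_id, chapter_id)
#   );
#   create table paragraphs (
#       book_id text, chapter_id text, text_en text, text_zh text,
#       char_count integer
#   );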
def insert_book_chapters(
    db_path: str, book_id: str, matched_chapters: Dict[str, Tuple[List[str], List[str]]]
):
    """
    Insert chapters and paragraphs into the database for a given book_id.

    Only inserts when the English and Chinese paragraph counts match, since
    paragraph rows are stored as aligned pairs.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    try:
        cur.execute("insert or ignore into books (book_id) values (?)", (book_id,))

        for chapter_id, (en_paragraphs, zh_paragraphs) in matched_chapters.items():
            # only process chapters whose paragraph counts match
            if len(en_paragraphs) != len(zh_paragraphs):
                print(
                    f"Skipping chapter {chapter_id} due to paragraph count mismatch: "
                    f"EN: {len(en_paragraphs)}, ZH: {len(zh_paragraphs)}"
                )
                continue

            # join paragraphs for the full chapter text
            chapter_text_en = "\n".join(en_paragraphs)
            chapter_text_zh = "\n".join(zh_paragraphs)

            cur.execute(
                """
                insert into chapters (book_id, chapter_id, text_en, text_zh)
                values (?, ?, ?, ?)
                on conflict (book_id, chapter_id) do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh
                """,
                (book_id, chapter_id, chapter_text_en, chapter_text_zh),
            )

            # clear any previously inserted paragraphs so reruns stay
            # idempotent, matching the chapter upsert above
            cur.execute(
                "delete from paragraphs where book_id = ? and chapter_id = ?",
                (book_id, chapter_id),
            )

            # insert aligned paragraph pairs
            for en_text, zh_text in zip(en_paragraphs, zh_paragraphs):
                char_count = len(en_text)
                cur.execute(
                    """
                    insert into paragraphs
                        (book_id, chapter_id, text_en, text_zh, char_count)
                    values (?, ?, ?, ?, ?)
                    """,
                    (book_id, chapter_id, en_text, zh_text, char_count),
                )

            print(
                f"Processed chapter {chapter_id} with {len(en_paragraphs)} paragraphs"
            )

        conn.commit()
        print(f"Successfully processed all matching chapters for book {book_id}")

    except Exception as e:
        conn.rollback()
        print(f"Error processing chapters: {str(e)}")
        raise
    finally:
        conn.close()


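# match_chapters returns {chapter_num: (en_paragraphs, zh_paragraphs)}, e.g.
# (hypothetical values): {"0001": (["First paragraph ..."], ["第一段……"])}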
def match_chapters(epub_dir: str) -> Dict[str, Tuple[List[str], List[str]]]:
    """Match and process chapters between the English and Chinese directories."""
    base_dir = Path(epub_dir)
    en_dir = base_dir / "en"
    zh_dir = base_dir / "zh"

    matched_paragraphs = {}

    # Get all English chapter files in order
    en_files = sorted(en_dir.glob("*.xhtml"))

    for en_file in en_files:
        # Construct the corresponding Chinese filename from the
        # four-digit chapter number
        match = re.search(r"(\d{4})", en_file.name)
        if not match:
            print(f"Warning: No chapter number in {en_file.name}")
            continue
        chapter_num = match.group(1)
        zh_file = zh_dir / f"{chapter_num}_.xhtml"

        if not zh_file.exists():
            print(f"Warning: No matching Chinese file for {en_file.name}")
            continue

        try:
            en_paragraphs, zh_paragraphs = process_chapter_pair(en_file, zh_file)

            # Dump diagnostics when paragraph counts diverge badly;
            # disabled for now:
            # if abs(len(en_paragraphs) - len(zh_paragraphs)) > 5:
            #     print_debug_info(
            #         chapter_num, en_file, zh_file, en_paragraphs, zh_paragraphs
            #     )

            print(f"Chapter {chapter_num}:")
            print(f"  English paragraphs: {len(en_paragraphs)}")
            print(f"  Chinese paragraphs: {len(zh_paragraphs)}")

            # Store results
            matched_paragraphs[chapter_num] = (en_paragraphs, zh_paragraphs)

        except Exception as e:
            print(f"Error processing chapter {chapter_num}: {str(e)}")

    return matched_paragraphs


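# Expected on-disk layout (inferred from match_chapters), e.g. for book "1v1h":
#   epubs/1v1h/en/<file containing a four-digit chapter number>.xhtml
#   epubs/1v1h/zh/0001_.xhtml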
def main():
    epub_dir = "epubs/1v1h"
    matched_chapters = match_chapters(epub_dir)
    insert_book_chapters("parallel_texts.db", "1v1h", matched_chapters)


if __name__ == "__main__":
    main()