# Files
# zh-en-wn-dataset/paragraph_ctx_collect.py
# 2025-02-09 14:30:25 +06:00
#
# 250 lines
# 7.6 KiB
# Python
from typing import List, Tuple
import sqlite3
import re
def get_chapter_paragraphs(
    cursor: sqlite3.Cursor, book_id: str, chapter_id: str
) -> Tuple[List[str], List[str]]:
    """
    Fetch all paragraphs of one chapter as two parallel lists.

    Rows where either side is NULL/empty are dropped entirely, so the two
    returned lists stay index-aligned.

    Args:
        cursor: Open cursor on the paragraphs database.
        book_id: Book identifier.
        chapter_id: Chapter identifier within the book.

    Returns:
        (english_paragraphs, chinese_paragraphs), both whitespace-stripped.
    """
    cursor.execute(
        """
        select text_en, text_zh
        from paragraphs
        where book_id = ? and chapter_id = ?
        """,
        (book_id, chapter_id),
    )
    pairs = [
        (en.strip(), zh.strip())
        for en, zh in cursor.fetchall()
        if en and zh  # skip rows missing either language
    ]
    if not pairs:
        return [], []
    en_list = [en for en, _ in pairs]
    zh_list = [zh for _, zh in pairs]
    return en_list, zh_list
def get_text_state(text: str) -> tuple[int, bool, bool]:
    """
    Analyzes text for continuity markers used to decide chunk boundaries.

    Args:
        text: String to analyze.

    Returns:
        tuple containing:
            - int: Net change in CJK bracket depth (positive for unclosed,
              negative for extra closing brackets)
            - bool: Whether the text ends with a colon (ASCII ":" or
              full-width ":"), i.e. it introduces what follows
            - bool: Whether the text ends without proper sentence
              termination (".", "!", "?" or full-width equivalents)
    """
    if not text:
        return 0, False, False
    # Count CJK bracket balance; a positive net means an unclosed span.
    opens = len(re.findall(r"[【「『]", text))
    closes = len(re.findall(r"[】」』]", text))
    stripped = text.rstrip()
    ends_with_punct = bool(re.search(r"[.!?。!?]\s*$", stripped))
    # Fix: also recognize the full-width colon ":", consistent with the
    # full-width sentence punctuation already handled above.
    ends_with_colon = stripped.endswith((":", ":"))
    return (opens - closes, ends_with_colon, not ends_with_punct)
def create_chunks(
    en_texts: List[str],
    zh_texts: List[str],
    target_size: int = 1024,
    min_size: int = 512,
    max_size: int = 2048,
) -> List[Tuple[str, str]]:
    """
    Creates parallel text chunks respecting continuity markers and size constraints.

    Sizes are measured on the English text only; the Chinese side simply
    follows the same paragraph boundaries (zh_texts is indexed in lockstep
    with en_texts, so it must be at least as long). A boundary is only placed
    where the English paragraph is not inside CJK brackets, does not end with
    a colon, and ends with sentence-final punctuation (see get_text_state).

    Args:
        en_texts: List of English text paragraphs
        zh_texts: List of corresponding Chinese text paragraphs (parallel to en_texts)
        target_size: Ideal size for each chunk in characters
        min_size: Minimum acceptable chunk size
        max_size: Maximum acceptable chunk size

    Returns:
        List of tuples containing (english_chunk, chinese_chunk); paragraphs
        within a chunk are joined with blank lines. A trailing remainder
        shorter than min_size is silently dropped.
    """
    chunks = []
    current_en = []  # English paragraphs accumulated for the pending chunk
    current_zh = []  # Chinese paragraphs, kept index-aligned with current_en
    current_chars = 0  # English character count of the pending chunk
    bracket_depth = 0  # net CJK bracket depth carried across paragraphs
    i = 0
    while i < len(en_texts):
        current_text = en_texts[i]
        para_chars = len(current_text)
        bracket_change, ends_with_colon, incomplete_sentence = get_text_state(
            current_text
        )
        bracket_depth += bracket_change
        # check if adding this paragraph would push the chunk past max_size
        if current_chars + para_chars > max_size:
            # only split if we're not in brackets, sentence is complete, and have met min_size
            if (
                bracket_depth <= 0
                and not incomplete_sentence
                and current_chars >= min_size
            ):
                chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
                current_en = []
                current_zh = []
                current_chars = 0
                # NOTE(review): unlike the target_size flush below,
                # bracket_depth is NOT reset here — confirm intentional.
        # add the current paragraph pair to the pending chunk
        current_en.append(current_text)
        current_zh.append(zh_texts[i])
        current_chars += para_chars
        # can we close out a chunk at this paragraph boundary?
        next_exists = i + 1 < len(en_texts)
        if (
            current_chars >= target_size
            and bracket_depth <= 0
            and not ends_with_colon
            and not incomplete_sentence
            # presumably defers the final chunk to the tail flush below;
            # verify that is the intent of requiring a following paragraph
            and next_exists
        ):
            chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
            current_en = []
            current_zh = []
            current_chars = 0
            bracket_depth = 0
        i += 1
    # flush the remaining text only if it reaches min_size
    if current_chars >= min_size:
        chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
    return chunks
def create_chunk_table(cursor: sqlite3.Cursor):
    """Ensure the paragraph_chunks table exists (no-op when already present)."""
    ddl = """
        create table if not exists paragraph_chunks (
            id integer primary key autoincrement,
            book_id text not null,
            chapter_id text not null,
            chunk_index integer not null,
            text_en text,
            text_zh text,
            char_count integer,
            foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
            unique(book_id, chapter_id, chunk_index)
        )
    """
    cursor.execute(ddl)
def store_book_chunks(db_path: str, book_id: str):
    """
    Process a book and store its chunks in the database.

    Opens its own connection, ensures the paragraph_chunks table exists,
    re-chunks the book via process_book, and upserts each chunk keyed by
    (book_id, chapter_id, chunk_index) so reruns overwrite prior results.

    Args:
        db_path: Path to the SQLite database file.
        book_id: Identifier of the book to process.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        create_chunk_table(cursor)
        chunks_by_chapter = process_book(db_path, book_id)
        for chapter_id, chapter_chunks in chunks_by_chapter:
            for i, (en_chunk, zh_chunk) in enumerate(chapter_chunks):
                cursor.execute(
                    """
                    insert into paragraph_chunks
                    (book_id, chapter_id, chunk_index, text_en, text_zh, char_count)
                    values (?, ?, ?, ?, ?, ?)
                    on conflict(book_id, chapter_id, chunk_index)
                    do update set
                    text_en = excluded.text_en,
                    text_zh = excluded.text_zh,
                    char_count = excluded.char_count
                    """,
                    # char_count reflects the English side only
                    (book_id, chapter_id, i, en_chunk, zh_chunk, len(en_chunk)),
                )
        conn.commit()
    finally:
        # Fix: previously the connection leaked if process_book or an
        # insert raised; always close it.
        conn.close()
def process_book(db_path: str, book_id: str) -> List[Tuple[str, List[Tuple[str, str]]]]:
    """
    Process book chapter by chapter, respecting chapter boundaries.

    Args:
        db_path: Path to the SQLite database file.
        book_id: Identifier of the book to process.

    Returns:
        List of (chapter_id, chapter_chunks) tuples, where chapter_chunks is
        the (english_chunk, chinese_chunk) list produced by create_chunks.
        Chapters with no usable paragraph pairs are omitted.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # NOTE(review): chapter_id is text, so this ordering is
        # lexicographic ("ch10" < "ch2") — confirm ids sort correctly.
        cursor.execute(
            """
            select distinct chapter_id
            from paragraphs
            where book_id = ?
            order by chapter_id
            """,
            (book_id,),
        )
        chapter_ids = [row[0] for row in cursor.fetchall()]
        all_chapter_chunks = []
        for chapter_id in chapter_ids:
            en_texts, zh_texts = get_chapter_paragraphs(cursor, book_id, chapter_id)
            if en_texts and zh_texts:  # skip empty chapters
                chapter_chunks = create_chunks(en_texts, zh_texts)
                all_chapter_chunks.append((chapter_id, chapter_chunks))
    finally:
        # Fix: previously the connection leaked if any query raised;
        # always close it.
        conn.close()
    return all_chapter_chunks
def process_all_books(db_path: str):
    """Run chunking and storage for every book listed in the books table."""
    # Collect the ids first and close the connection before the per-book
    # work, since store_book_chunks opens its own connection.
    connection = sqlite3.connect(db_path)
    rows = connection.cursor().execute("select book_id from books").fetchall()
    connection.close()
    for (book_id,) in rows:
        print(f"Processing and storing book: {book_id}")
        store_book_chunks(db_path, book_id)
if __name__ == "__main__":
    import sys

    if len(sys.argv) == 3 and sys.argv[1] == "--store":
        # Store mode: chunk every book in the given database.
        db_path = sys.argv[2]
        process_all_books(db_path)
    else:
        # Smoke test: paragraphs containing a bracketed note and colons,
        # which the chunker must not split mid-continuation.
        test_en = [
            "On it were words left by Wen Jin's parents:",
            "【We learned from the news that you two got married.",
            "Take care of each other in the future, if you need anything,",
            "talk to us, even though you may not need to.",
            "From Mom and Dad.】",
            "After reading this, Wen Jin felt:",
            "A complex mix of emotions surged through him.",
            'Returning home with the parcels, Jiang Wan asked him, "Should the shoes be unpacked?"',
        ]
        test_zh = [f"zh{n}" for n in range(len(test_en))]
        for i, (en, zh) in enumerate(create_chunks(test_en, test_zh, target_size=1024), 1):
            print(f"\nChunk {i}:")
            print(en)
            print("-" * 40)