chore: json dataset

2025-02-09 14:30:25 +06:00
parent 94babaa7aa
commit fd380e250d
4 changed files with 13827 additions and 0 deletions

13477
cn_en_wn_dataset.json Normal file

File diff suppressed because it is too large

89
gen_alpaca.py Normal file

@@ -0,0 +1,89 @@
import sqlite3
import json
import random
from typing import List, Dict, Any
from pathlib import Path
def create_alpaca_dataset(
db_path: str, output_path: str, samples_per_book: int = 155
) -> None:
"""
Create an Alpaca-style JSON dataset for Chinese to English translation.
Args:
db_path: Path to the SQLite database
output_path: Path where the JSON dataset will be saved
samples_per_book: Maximum number of samples to take from each book_id
"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(
"select distinct book_id from paragraph_chunks where text_en is not null and text_zh is not null"
)
book_ids = [row[0] for row in cursor.fetchall()]
dataset: List[Dict[str, Any]] = []
for book_id in book_ids:
# get samples for current book_id
cursor.execute(
"""
select text_zh, text_en
from paragraph_chunks
where book_id = ?
and text_en is not null
and text_zh is not null
and length(text_zh) > 0
and length(text_en) > 0
""",
(book_id,),
)
samples = cursor.fetchall()
if not samples:
continue
selected_samples = random.sample(samples, min(len(samples), samples_per_book))
# Alpaca format
for zh_text, en_text in selected_samples:
entry = {
"instruction": "Translate the following Chinese text to English:",
"input": zh_text.strip(),
"output": en_text.strip(),
}
dataset.append(entry)
conn.close()
random.shuffle(dataset)
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(dataset, f, ensure_ascii=False, indent=2)
print(f"Dataset created successfully with {len(dataset)} total samples")
print(f"Number of unique books: {len(book_ids)}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Generate Alpaca-style translation dataset"
)
parser.add_argument(
"--db_path", type=str, required=True, help="Path to SQLite database"
)
parser.add_argument(
"--output_path", type=str, required=True, help="Path for output JSON file"
)
parser.add_argument(
"--samples_per_book",
type=int,
default=155,
help="Maximum number of samples to take from each book_id",
)
args = parser.parse_args()
create_alpaca_dataset(args.db_path, args.output_path, args.samples_per_book)
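
A minimal usage sketch (not part of this commit): the command line and paths below are hypothetical placeholders, but the entry shape mirrors the dictionary built in create_alpaca_dataset above.

# Hypothetical invocation; database and output paths are placeholders:
#   python gen_alpaca.py --db_path books.db --output_path cn_en_wn_dataset.json --samples_per_book 155
# Each element of the output JSON list is expected to look like:
example_entry = {
    "instruction": "Translate the following Chinese text to English:",
    "input": "中文原文段落",  # text_zh from paragraph_chunks, stripped
    "output": "English translation of the paragraph",  # text_en, stripped
}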

249
paragraph_ctx_collect.py Normal file

@@ -0,0 +1,249 @@
from typing import List, Tuple
import sqlite3
import re
def get_chapter_paragraphs(
cursor: sqlite3.Cursor, book_id: str, chapter_id: str
) -> Tuple[List[str], List[str]]:
"""
Gets all paragraphs for a specific chapter.
Returns (english_paragraphs, chinese_paragraphs).
"""
cursor.execute(
"""
select text_en, text_zh
from paragraphs
where book_id = ? and chapter_id = ?
""",
(book_id, chapter_id),
)
en_texts = []
zh_texts = []
for en, zh in cursor.fetchall():
if en and zh: # Skip empty paragraphs
en_texts.append(en.strip())
zh_texts.append(zh.strip())
return en_texts, zh_texts
def get_text_state(text: str) -> tuple[int, bool, bool]:
"""
Analyzes text for continuity markers
Returns (bracket_change, ends_with_colon, incomplete_sentence)
Args:
text: String to analyze
Returns:
tuple containing:
- int: Net change in bracket depth (positive for unclosed, negative for extra closing)
- bool: Whether the text ends with a colon
- bool: Whether the text ends without proper sentence termination
"""
if not text:
return 0, False, False
# count bracket balance
opens = len(re.findall(r"[【「『]", text))
closes = len(re.findall(r"[】」』]", text))
ends_with_punct = bool(re.search(r"[.!?。!?]\s*$", text.rstrip()))
return (opens - closes, text.rstrip().endswith(":"), not ends_with_punct)
def create_chunks(
en_texts: List[str],
zh_texts: List[str],
target_size: int = 1024,
min_size: int = 512,
max_size: int = 2048,
) -> List[Tuple[str, str]]:
"""
Creates parallel text chunks respecting continuity markers and size constraints
Args:
en_texts: List of English text paragraphs
zh_texts: List of corresponding Chinese text paragraphs
target_size: Ideal size for each chunk in characters
min_size: Minimum acceptable chunk size
max_size: Maximum acceptable chunk size
Returns:
List of tuples containing (english_chunk, chinese_chunk)
"""
chunks = []
current_en = []
current_zh = []
current_chars = 0
bracket_depth = 0
i = 0
while i < len(en_texts):
current_text = en_texts[i]
para_chars = len(current_text)
bracket_change, ends_with_colon, incomplete_sentence = get_text_state(
current_text
)
bracket_depth += bracket_change
# check if adding would exceed max_size
if current_chars + para_chars > max_size:
# only split if we're not inside brackets, the sentence is complete, and we've met min_size
if (
bracket_depth <= 0
and not incomplete_sentence
and current_chars >= min_size
):
chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
current_en = []
current_zh = []
current_chars = 0
# add current paragraph
current_en.append(current_text)
current_zh.append(zh_texts[i])
current_chars += para_chars
# can we create a chunk?
next_exists = i + 1 < len(en_texts)
if (
current_chars >= target_size
and bracket_depth <= 0
and not ends_with_colon
and not incomplete_sentence
and next_exists
):
chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
current_en = []
current_zh = []
current_chars = 0
bracket_depth = 0
i += 1
# add remaining text if it meets min_size
if current_chars >= min_size:
chunks.append(("\n\n".join(current_en), "\n\n".join(current_zh)))
return chunks
def create_chunk_table(cursor: sqlite3.Cursor):
"""Creates the paragraph_chunks table if it doesn't exist"""
cursor.execute(
"""
create table if not exists paragraph_chunks (
id integer primary key autoincrement,
book_id text not null,
chapter_id text not null,
chunk_index integer not null,
text_en text,
text_zh text,
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
unique(book_id, chapter_id, chunk_index)
)
"""
)
def store_book_chunks(db_path: str, book_id: str):
"""Process a book and store its chunks in the database"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
create_chunk_table(cursor)
chunks_by_chapter = process_book(db_path, book_id)
for chapter_id, chapter_chunks in chunks_by_chapter:
for i, (en_chunk, zh_chunk) in enumerate(chapter_chunks):
cursor.execute(
"""
insert into paragraph_chunks
(book_id, chapter_id, chunk_index, text_en, text_zh, char_count)
values (?, ?, ?, ?, ?, ?)
on conflict(book_id, chapter_id, chunk_index)
do update set
text_en = excluded.text_en,
text_zh = excluded.text_zh,
char_count = excluded.char_count
""",
(book_id, chapter_id, i, en_chunk, zh_chunk, len(en_chunk)),
)
conn.commit()
conn.close()
def process_book(db_path: str, book_id: str) -> List[Tuple[str, List[Tuple[str, str]]]]:
"""
Process book chapter by chapter, respecting chapter boundaries
Returns list of (chapter_id, chapter_chunks) tuples
"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(
"""
select distinct chapter_id
from paragraphs
where book_id = ?
order by chapter_id
""",
(book_id,),
)
chapter_ids = [row[0] for row in cursor.fetchall()]
all_chapter_chunks = []
for chapter_id in chapter_ids:
en_texts, zh_texts = get_chapter_paragraphs(cursor, book_id, chapter_id)
if en_texts and zh_texts: # skip empty chapters
chapter_chunks = create_chunks(en_texts, zh_texts)
all_chapter_chunks.append((chapter_id, chapter_chunks))
conn.close()
return all_chapter_chunks
def process_all_books(db_path: str):
"""Process and store chunks for all books in database"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("select book_id from books")
book_ids = [row[0] for row in cursor.fetchall()]
conn.close()
for book_id in book_ids:
print(f"Processing and storing book: {book_id}")
store_book_chunks(db_path, book_id)
if __name__ == "__main__":
import sys
if len(sys.argv) == 3 and sys.argv[1] == "--store":
db_path = sys.argv[2]
process_all_books(db_path)
else:
# quick manual test with sample paragraphs
test_en = [
"On it were words left by Wen Jin's parents:",
"【We learned from the news that you two got married.",
"Take care of each other in the future, if you need anything,",
"talk to us, even though you may not need to.",
"From Mom and Dad.】",
"After reading this, Wen Jin felt:",
"A complex mix of emotions surged through him.",
'Returning home with the parcels, Jiang Wan asked him, "Should the shoes be unpacked?"',
]
test_zh = ["zh" + str(i) for i in range(len(test_en))]
chunks = create_chunks(test_en, test_zh, target_size=1024)
for i, (en, zh) in enumerate(chunks, 1):
print(f"\nChunk {i}:")
print(en)
print("-" * 40)


@@ -21,3 +21,15 @@ create table if not exists paragraphs (
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id)
);
create table if not exists paragraph_chunks (
id integer primary key autoincrement,
book_id text not null,
chapter_id text not null,
chunk_index integer not null,
text_en text,
text_zh text,
char_count integer,
foreign key (book_id, chapter_id) references chapters(book_id, chapter_id),
unique(book_id, chapter_id, chunk_index)
);
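
A small inspection sketch against the new table, assuming the same hypothetical "books.db" path; the column names come from the schema above.

import sqlite3

# Count chunks and average chunk length per book (hypothetical path).
conn = sqlite3.connect("books.db")
rows = conn.execute(
    "select book_id, count(*), avg(char_count) from paragraph_chunks group by book_id"
)
for book_id, n_chunks, avg_chars in rows:
    print(book_id, n_chunks, round(avg_chars or 0))
conn.close()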