zh-en-wn-dataset/mlx_dataset_gen.py

import sqlite3
import json
from typing import List, Tuple
import unicodedata


def get_chapters(cursor) -> List[Tuple[str, str]]:
    """Fetch (text_en, text_zh) pairs for chapters whose English text is shorter than 36000 characters."""
    query = """
        select text_en, text_zh
        from chapters
        where length(text_en) < 36000
    """
    return cursor.execute(query).fetchall()


def should_join_lines(line: str) -> bool:
    """Check if this line should be joined with the next line, based on how it ends."""
    line = line.rstrip()
    return line.endswith(",") or (
        line.count('"') % 2 == 1
    )  # an odd number of quotes means an open quotation


def process_text(text: str) -> str:
    """Normalize text and rebuild its paragraphs.

    Lines wrapped between #<# and #># markers are joined into a single
    paragraph; every other non-empty line becomes its own paragraph.
    Paragraphs are separated by blank lines.
    """
    text = unicodedata.normalize("NFKC", text)
    lines = text.strip().split("\n")
    processed_lines = []
    current_group = []
    in_marked_section = False

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if line.startswith("#<#"):
            # Start of marked section - remove marker and store first line
            in_marked_section = True
            first_line = line[3:].strip()  # Remove #<# and whitespace
            current_group.append(first_line)
            continue

        if line.endswith("#>#"):
            # End of marked section - remove marker and store last line
            last_line = line[:-3].strip()  # Remove #># and whitespace
            current_group.append(last_line)
            # Join all collected lines with a space and add to processed lines
            processed_lines.append(" ".join(current_group))
            current_group = []
            in_marked_section = False
            continue

        if in_marked_section:
            current_group.append(line)
        else:
            processed_lines.append(line)

    # Handle any remaining grouped lines (in case of malformed input)
    if current_group:
        processed_lines.append(" ".join(current_group))

    # Join paragraphs with double newlines
    return "\n\n".join(processed_lines)
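
# For example, given "#<#first,\nsecond#>#\nthird", process_text joins the marked
# lines with spaces and keeps the unmarked line as its own paragraph, returning
# "first, second\n\nthird".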


def create_dataset(db_path: str, output_path: str):
    """Write a JSONL dataset of chat-formatted zh->en chapter pairs from the SQLite database."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    try:
        chapters = get_chapters(cursor)
        with open(output_path, "w", encoding="utf-8") as f:
            for text_en, text_zh in chapters:
                processed_en = process_text(text_en)
                processed_zh = process_text(text_zh)
                # One training example per chapter: the Chinese text as the user
                # turn, the English text as the assistant turn.
                entry = {
                    "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
                }
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    finally:
        conn.close()


if __name__ == "__main__":
    DB_PATH = "parallel_texts.db"
    OUTPUT_PATH = "datasets/dataset_v1.jsonl"
    create_dataset(DB_PATH, OUTPUT_PATH)