chore: haha

2025-02-11 03:25:03 +06:00
parent 9746aad58a
commit 28342e0ace
3920 changed files with 1234726 additions and 15356 deletions
--- a/mlx_dataset_gen.py
+++ b/mlx_dataset_gen.py
@@ -0,0 +1,95 @@
+import sqlite3
+import json
+from typing import List, Tuple
+import unicodedata
+
+
+def get_chapters(cursor, max_length: int = 36000) -> List[Tuple[str, str]]:
+    """Fetch the (English, Chinese) text pairs of all sufficiently short chapters.
+
+    Args:
+        cursor: A database cursor whose connection has a ``chapters`` table
+            with ``text_en`` and ``text_zh`` columns.
+        max_length: Exclusive upper bound on SQL ``length(text_en)``; chapters
+            whose English text is this long or longer are skipped.
+
+    Returns:
+        A list of ``(text_en, text_zh)`` rows.
+    """
+    # Bind the limit as a parameter instead of baking it into the SQL text,
+    # so callers can choose a different cut-off without editing the query.
+    query = """
+    select text_en, text_zh
+    from chapters
+    where length(text_en) < ?
+    """
+    return cursor.execute(query, (max_length,)).fetchall()
+
+
+def should_join_lines(line: str) -> bool:
+    """Tell whether *line* looks unfinished and belongs with the line after it.
+
+    Trailing whitespace is ignored. A line counts as unfinished when it ends
+    with a comma or contains an odd number of double-quote characters (a
+    quotation that was opened but not closed).
+    """
+    trimmed = line.rstrip()
+    if trimmed.endswith(","):
+        return True
+
+    # An unpaired '"' means a quotation is still open at the end of the line.
+    quote_count = trimmed.count('"')
+    return quote_count % 2 != 0
+
+
+def process_text(text: str) -> str:
+    """Normalize *text* and collapse marked multi-line sections into paragraphs.
+
+    The text is NFKC-normalized, split into lines, and every line is stripped;
+    blank lines are dropped. Lines between a ``#<#`` opening marker (at the
+    start of a line) and a ``#>#`` closing marker (at the end of a line) are
+    joined with single spaces into one paragraph. Every other non-blank line
+    becomes its own paragraph.
+
+    Args:
+        text: Raw chapter text, possibly containing ``#<#`` / ``#>#`` markers.
+
+    Returns:
+        The paragraphs joined by blank lines (``"\n\n"``).
+    """
+    open_marker = "#<#"
+    close_marker = "#>#"
+
+    # Normalize first so the marker checks below see the normalized characters.
+    text = unicodedata.normalize("NFKC", text)
+
+    processed_lines = []
+    current_group = []
+    in_marked_section = False
+
+    for line in text.strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+
+        if line.startswith(open_marker):
+            in_marked_section = True
+            line = line[len(open_marker):].strip()
+
+        # Checked on the same line as the opening marker too, so a section
+        # that opens and closes on one line does not swallow what follows.
+        closes_section = line.endswith(close_marker)
+        if closes_section:
+            line = line[: -len(close_marker)].strip()
+
+        if not (in_marked_section or closes_section):
+            processed_lines.append(line)
+            continue
+
+        # A marker standing alone on its line leaves nothing behind; skip the
+        # empty fragment so the joined paragraph gets no stray spaces.
+        if line:
+            current_group.append(line)
+
+        if closes_section:
+            if current_group:
+                processed_lines.append(" ".join(current_group))
+            current_group = []
+            in_marked_section = False
+
+    # Malformed input: a section that was opened but never closed.
+    if current_group:
+        processed_lines.append(" ".join(current_group))
+
+    return "\n\n".join(processed_lines)
+
+
+def create_dataset(db_path: str, output_path: str):
+    """Export the chapter pairs from a SQLite database as a JSON Lines file.
+
+    Each output line is a JSON object with a single ``"text"`` key. Its value
+    holds the processed Chinese text as the ``user`` turn and the processed
+    English text as the ``assistant`` turn, delimited by ``<|im_start|>`` and
+    ``<|im_end|>`` tokens.
+
+    Args:
+        db_path: Path of the SQLite database to read.
+        output_path: Path of the ``.jsonl`` file to write (overwritten if it
+            exists). Missing parent directories are created.
+    """
+    # Imported locally so this function carries its own dependency.
+    from pathlib import Path
+
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    try:
+        chapters = get_chapters(cursor)
+
+        # open(..., "w") does not create directories; without this a fresh
+        # checkout lacking the output folder fails with FileNotFoundError.
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            for text_en, text_zh in chapters:
+                processed_en = process_text(text_en)
+                processed_zh = process_text(text_zh)
+
+                entry = {
+                    "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                }
+
+                # ensure_ascii=False keeps the Chinese text as readable UTF-8
+                # instead of \uXXXX escapes.
+                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+
+    finally:
+        # Always release the database handle, even if the export fails.
+        conn.close()
+
+
+# Script entry point: runs only when the file is executed directly.
+if __name__ == "__main__":
+    # Input database and output file, both relative to the working directory.
+    DB_PATH = "parallel_texts.db"
+    OUTPUT_PATH = "datasets/dataset_v1.jsonl"
+
+    # Read the chapters from DB_PATH and write the dataset to OUTPUT_PATH.
+    create_dataset(DB_PATH, OUTPUT_PATH)