import sqlite3
import json
from typing import List, Tuple
import unicodedata


def get_chapters(cursor) -> List[Tuple[str, str]]:
    """Return (text_en, text_zh) pairs for chapters whose English text is under 36,000 characters."""
    query = """
        select text_en, text_zh
        from chapters
        where length(text_en) < 36000
    """
    return cursor.execute(query).fetchall()
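
# Assumed schema (not defined in this file): get_chapters expects a `chapters`
# table with at least `text_en` and `text_zh` TEXT columns, for example:
#   CREATE TABLE chapters (id INTEGER PRIMARY KEY, text_en TEXT, text_zh TEXT);
# The `id` column is illustrative; only text_en and text_zh are queried above.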


def should_join_lines(line: str) -> bool:
    """Check whether a line should be joined with the next line, based on how it ends.

    Note: defined as a helper but not called elsewhere in this script.
    """
    line = line.rstrip()
    return line.endswith(",") or (
        line.count('"') % 2 == 1
    )  # an odd number of double quotes means an open quotation
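
# Illustrative behavior (hypothetical inputs):
#   should_join_lines('He said,')          -> True   (trailing comma)
#   should_join_lines('She replied, "Yes') -> True   (unbalanced double quote)
#   should_join_lines('The end.')          -> False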


def process_text(text: str) -> str:
    """Process text by handling special section markers and line breaks."""
    text = unicodedata.normalize("NFKC", text)

    lines = text.strip().split("\n")
    processed_lines = []
    current_group = []
    in_marked_section = False

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if line.startswith("#<#"):
            # Start of a marked section - remove the marker and store the first line
            in_marked_section = True
            first_line = line[3:].strip()  # Remove #<# and surrounding whitespace
            current_group.append(first_line)
            continue

        if line.endswith("#>#"):
            # End of a marked section - remove the marker and store the last line
            last_line = line[:-3].strip()  # Remove #># and surrounding whitespace
            current_group.append(last_line)
            # Join all collected lines with spaces and add them as one paragraph
            processed_lines.append(" ".join(current_group))
            current_group = []
            in_marked_section = False
            continue

        if in_marked_section:
            current_group.append(line)
        else:
            processed_lines.append(line)

    # Handle any remaining grouped lines (in case of malformed input)
    if current_group:
        processed_lines.append(" ".join(current_group))

    # Join paragraphs with double newlines
    return "\n\n".join(processed_lines)


def create_dataset(db_path: str, output_path: str):
    """Write one JSONL line per chapter pair, formatted as a chat-style training example."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    try:
        chapters = get_chapters(cursor)

        with open(output_path, "w", encoding="utf-8") as f:
            for text_en, text_zh in chapters:
                processed_en = process_text(text_en)
                processed_zh = process_text(text_zh)

                # Wrap each pair in a ChatML-style template: the Chinese text is
                # the user turn, the English text is the assistant turn.
                entry = {
                    "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
                }

                f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    finally:
        conn.close()
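
# Illustrative output line (hypothetical content), one JSON object per chapter pair:
#   {"text": "<|im_start|>user\n...processed Chinese text...<|im_end|>\n<|im_start|>assistant\n...processed English text...<|im_end|>"}
# Each line can be loaded independently with json.loads().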


if __name__ == "__main__":
    DB_PATH = "parallel_texts.db"
    OUTPUT_PATH = "datasets/dataset_v1.jsonl"

    create_dataset(DB_PATH, OUTPUT_PATH)