This commit is contained in:
2025-02-11 20:30:49 +06:00
parent befdc9c945
commit c7609f8328
5 changed files with 308 additions and 2 deletions

View File

@@ -95,13 +95,21 @@ def create_datasets(
# Helper function to write datasets
def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
    """Write chapter pairs to ``filepath`` as JSON Lines.

    Every pair is run through ``process_text`` and emitted as one JSON
    object per line with three fields: a fixed ``instruction`` string, the
    Chinese text as ``input`` and the English text as ``output``.

    Args:
        chapters: Pairs ordered as ``(text_en, text_zh)``.
        filepath: Destination path; it is opened in ``"w"`` mode, so any
            existing content is replaced.
    """
    instruction = "Translate the following Chinese text to English:"
    with open(filepath, "w", encoding="utf-8") as f:
        for text_en, text_zh in chapters:
            processed_en = process_text(text_en)
            processed_zh = process_text(text_zh)
            # NOTE: the earlier single-field "text" records (chat-token and
            # prompt-template variants) are replaced by this structured
            # record; mixing both in one dict literal was a syntax error.
            entry = {
                "instruction": instruction,
                "input": processed_zh,
                "output": processed_en,
            }
            # ensure_ascii=False writes the Chinese characters as-is rather
            # than as \uXXXX escapes.
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")