chore: _
This commit is contained in:
@@ -95,13 +95,21 @@ def create_datasets(
|
||||
|
||||
# Helper function to write datasets
|
||||
def write_dataset(chapters: List[Tuple[str, str]], filepath: str) -> None:
    """Write translation pairs to ``filepath`` in JSON Lines format.

    Every pair becomes one JSON object on its own line with three keys:
    ``instruction`` (a fixed translation prompt), ``input`` (the processed
    Chinese text) and ``output`` (the processed English text).

    Args:
        chapters: ``(english, chinese)`` text pairs, in that order.
        filepath: Destination path; the file is created or overwritten.
    """
    instruction = "Translate the following Chinese text to English:"

    with open(filepath, "w", encoding="utf-8") as f:
        for text_en, text_zh in chapters:
            processed_en = process_text(text_en)
            processed_zh = process_text(text_zh)

            # One record per pair. The earlier single-field "text" record
            # is intentionally not emitted alongside these keys.
            entry = {
                "instruction": instruction,
                "input": processed_zh,
                "output": processed_en,
            }
            # ensure_ascii=False writes the Chinese characters as-is rather
            # than as \uXXXX escapes; the file is opened as UTF-8 above.
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user