Files
zh-en-wn-dataset/shuffle_dataset.py
2025-02-11 03:25:03 +06:00

40 lines
950 B
Python

import json
import random
import sys
import os
def shuffle_jsonl(filename):
    """Shuffle the records of a JSONL file and write them to a sibling file.

    The whole input is read into memory, shuffled with a fixed seed (42) so
    the result is reproducible, and written next to the input as
    ``<stem>_shuffled.jsonl`` with one JSON document per line.

    Args:
        filename: Path to the input JSONL file (UTF-8, one JSON value per
            line). Blank or whitespace-only lines are ignored.

    Returns:
        None. If ``filename`` is not an existing regular file, an error
        message is printed and nothing is written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise fail later inside open().
    if not os.path.isfile(filename):
        print(f"Error: File {filename} not found")
        return

    dirname = os.path.dirname(filename)
    basename = os.path.basename(filename)
    output_file = os.path.join(
        dirname, f"{os.path.splitext(basename)[0]}_shuffled.jsonl"
    )

    with open(filename, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line); json.loads would
        # raise on them.
        data = [json.loads(line) for line in f if line.strip()]

    # A private generator seeded with 42 yields the same permutation as
    # random.seed(42) + random.shuffle, without reseeding the global RNG
    # that other code in the process may rely on.
    random.Random(42).shuffle(data)

    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in data
        )

    print(f"Shuffled {len(data):,} examples")
    print(f"Output written to: {output_file}")
if __name__ == "__main__":
    # Exactly one positional argument is expected: the JSONL file to shuffle.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        shuffle_jsonl(cli_args[0])
    else:
        print("Usage: python script.py <jsonl_file>")
        sys.exit(1)