"""Shuffle the records of a JSON Lines file into a sibling ``*_shuffled.jsonl``."""

import json
import os
import random
import sys

# Fixed seed so repeated runs over the same input yield the same order.
_SHUFFLE_SEED = 42


def shuffle_jsonl(filename: str) -> None:
    """Write a deterministically shuffled copy of a JSON Lines file.

    The output is written next to the input, named
    ``<input stem>_shuffled.jsonl``. Every record is parsed and
    re-serialised with ``ensure_ascii=False``, so non-ASCII text is kept
    as-is rather than escaped.

    Args:
        filename: Path to the input ``.jsonl`` file (UTF-8, one JSON value
            per line). Blank or whitespace-only lines are ignored.

    Returns:
        None. If ``filename`` does not exist, an error message is printed
        and nothing is written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(filename):
        print(f"Error: File {filename} not found")
        return

    stem = os.path.splitext(os.path.basename(filename))[0]
    output_file = os.path.join(os.path.dirname(filename), f"{stem}_shuffled.jsonl")

    with open(filename, "r", encoding="utf-8") as f:
        # Skip blank lines (commonly a trailing newline at EOF); feeding
        # them to json.loads would raise JSONDecodeError.
        data = [json.loads(line) for line in f if line.strip()]

    # A private generator gives the same permutation as seeding the module
    # RNG with the same value, without resetting the caller's global
    # random state.
    random.Random(_SHUFFLE_SEED).shuffle(data)

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in data)

    print(f"Shuffled {len(data):,} examples")
    print(f"Output written to: {output_file}")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py INPUT_JSONL")
        sys.exit(1)

    shuffle_jsonl(sys.argv[1])