40 lines
950 B
Python
40 lines
950 B
Python
import json
|
|
import random
|
|
import sys
|
|
import os
|
|
|
|
|
|
def shuffle_jsonl(filename):
    """Shuffle the records of a JSONL file into a sibling ``*_shuffled.jsonl`` file.

    Every non-blank line of ``filename`` is parsed as one JSON value. The
    parsed records are shuffled with a fixed seed (42), so repeated runs on
    the same input give the same order, and written one per line to
    ``<stem>_shuffled.jsonl`` in the same directory as the input.

    Args:
        filename: Path to the input JSONL file.

    Returns:
        None. If ``filename`` does not exist, an error message is printed
        and no output file is written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or the output cannot be written.
    """
    if not os.path.exists(filename):
        print(f"Error: File {filename} not found")
        return

    # A private generator seeded with 42 yields the same permutation as
    # seeding the module-level RNG, without resetting the global random
    # state for the rest of the process.
    rng = random.Random(42)

    stem = os.path.splitext(os.path.basename(filename))[0]
    output_file = os.path.join(os.path.dirname(filename), f"{stem}_shuffled.jsonl")

    with open(filename, "r", encoding="utf-8") as f:
        # Skip blank / whitespace-only lines (e.g. stray empty lines between
        # records); json.loads would raise on them.
        data = [json.loads(line) for line in f if line.strip()]

    rng.shuffle(data)

    with open(output_file, "w", encoding="utf-8") as f:
        for item in data:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"Shuffled {len(data):,} examples")
    print(f"Output written to: {output_file}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: exactly one command-line argument, the path of the
    # JSONL file to shuffle, is expected after the script name.
    if len(sys.argv) == 2:
        shuffle_jsonl(sys.argv[1])
    else:
        # Wrong argument count: show usage and exit with a failure status.
        print("Usage: python script.py <jsonl_file>")
        sys.exit(1)
|