chore: haha
This commit is contained in:
39
shuffle_dataset.py
Normal file
39
shuffle_dataset.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def shuffle_jsonl(filename):
    """Shuffle the records of a JSON Lines file into a sibling file.

    Every non-blank line of ``filename`` is parsed as one JSON value, the
    resulting list is shuffled with a fixed seed (42) so repeated runs give
    the same order, and the records are written one per line to
    ``<stem>_shuffled.jsonl`` in the same directory as the input.

    Args:
        filename: Path to the input JSONL file (read as UTF-8).

    Returns:
        None. If ``filename`` does not exist, an error message is printed
        and nothing is written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(filename):
        print(f"Error: File {filename} not found")
        return

    # A private fixed-seed generator gives the same permutation as seeding
    # the module-level one, without clobbering global random state for
    # whoever calls this function.
    rng = random.Random(42)

    dirname = os.path.dirname(filename)
    basename = os.path.basename(filename)
    output_file = os.path.join(
        dirname, f"{os.path.splitext(basename)[0]}_shuffled.jsonl"
    )

    with open(filename, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are
        # not records; feeding them to json.loads would raise.
        data = [json.loads(line) for line in f if line.strip()]

    rng.shuffle(data)

    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in data
        )

    print(f"Shuffled {len(data):,} examples")
    print(f"Output written to: {output_file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exactly one positional argument, the JSONL path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python script.py <jsonl_file>")
        sys.exit(1)

    (jsonl_path,) = cli_args
    shuffle_jsonl(jsonl_path)
|
||||
Reference in New Issue
Block a user