chore: add helper shell scripts, VS Code settings, and switch dataset config to custom JSON data

This commit is contained in:
2025-02-13 17:25:01 +06:00
parent 66e61c9aad
commit 0481713eb6
6 changed files with 78 additions and 1 deletions

3
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,3 @@
{
"python.languageServer": "None"
}

42
scripts/avg_length.sh Executable file
View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Print a file's average line length and the line whose length is
# closest to that average.
#
# Usage: avg_length.sh <filename>
if [ $# -ne 1 ]; then
  echo "Usage: $0 <filename>" >&2
  exit 1
fi
file="$1"
# Check if file exists
if [ ! -f "$file" ]; then
  echo "Error: File '$file' does not exist" >&2
  exit 1
fi
# Single awk pass: accumulate every line's length and text, then compute
# the average and find the closest line in END. This fixes two bugs in
# the original two-pass version: the file was read twice, and the
# unquoted `-v target=$avg_length` became the malformed assignment
# `target=` (an awk error) whenever the file was empty.
awk '
  {
    len[NR] = length($0)
    text[NR] = $0
    total += len[NR]
  }
  END {
    if (NR == 0) {
      print "Error: file is empty" > "/dev/stderr"
      exit 1
    }
    # Round the average to the nearest integer, as the original did.
    target = sprintf("%.0f", total / NR) + 0
    for (i = 1; i <= NR; i++) {
      d = len[i] - target
      if (d < 0) d = -d            # absolute difference (no sqrt(x^2) hack)
      if (i == 1 || d < min_diff) {
        min_diff = d
        line_num = i
      }
    }
    print "Average line length: " target
    print "Closest matching line (#" line_num "):"
    print text[line_num]
    print "Length: " len[line_num]
  }
' "$file"

6
scripts/jsonl_to_json.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/sh
# Convert a JSON Lines file into a single JSON array.
#
# Usage: jsonl_to_json.sh input.jsonl output.json
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 input.jsonl output.json" >&2
  exit 1
fi
# Fail early with a clear message instead of a confusing jq error
# (and avoid creating an empty output file on failure).
command -v jq >/dev/null 2>&1 || {
  echo "Error: jq is required but not installed" >&2
  exit 1
}
if [ ! -f "$1" ]; then
  echo "Error: input file '$1' not found" >&2
  exit 1
fi
# -s (slurp) reads all input values into one array.
jq -s '.' <"$1" >"$2"

23
scripts/truncate.sh Executable file
View File

@@ -0,0 +1,23 @@
#!/bin/sh
# Remove lines longer than a character limit from a file.
#
# Usage: truncate.sh input_file max_chars output_file
if [ $# -ne 3 ]; then
  printf "Usage: %s input_file max_chars output_file\n" "$0" >&2
  printf "Example: %s data.jsonl 10000 cleaned.jsonl\n" "$0" >&2
  exit 1
fi
input_file=$1
max_chars=$2
output_file=$3
# Check if input file exists
if [ ! -f "$input_file" ]; then
  printf "Error: Input file '%s' not found\n" "$input_file" >&2
  exit 1
fi
# Reject a non-numeric limit: awk would coerce it to 0 and silently
# drop every line of the input.
case $max_chars in
  '' | *[!0-9]*)
    printf "Error: max_chars must be a non-negative integer, got '%s'\n" "$max_chars" >&2
    exit 1
    ;;
esac
# Keep only lines whose length is at most max_chars.
awk -v max="$max_chars" 'length($0) <= max' "$input_file" >"$output_file"
printf "Processing complete. Lines longer than %s characters have been removed.\n" "$max_chars"
printf "Result saved to: %s\n" "$output_file"

View File

@@ -52,7 +52,10 @@ resume_from_checkpoint: False
# Dataset and Sampler
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
_component_: torchtune.datasets.instruct_dataset
source: json
data_files: data/my_data.json
split: train
packed: True # True increases speed
seed: 42
shuffle: False