chore: add data-prep helper scripts and VS Code settings; switch dataset config to instruct_dataset
This commit is contained in:
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
{
|
||||||
|
"python.languageServer": "None"
|
||||||
|
}
|
||||||
42
scripts/avg_length.sh
Executable file
42
scripts/avg_length.sh
Executable file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/bin/bash
#
# avg_length.sh - report the average line length of a text file and show the
# line whose length is closest to that average.
#
# Usage: avg_length.sh <filename>
#
# Output (stdout):
#   Average line length: <mean line length, rounded to an integer>
#   Closest matching line (#<line number>):
#   <text of that line>
#   Length: <length of that line>
#
# Exit status: 0 on success; 1 on a usage error, a missing file, or a file
# that contains no lines.

if [ $# -ne 1 ]; then
  echo "Usage: $0 <filename>" >&2
  exit 1
fi

file="$1"

# Check if file exists
if [ ! -f "$file" ]; then
  echo "Error: File '$file' does not exist" >&2
  exit 1
fi

# Pass 1: mean line length, rounded to the nearest integer.
# The file is fed on stdin rather than as an operand so awk can never
# misread its name as an option (leading '-') or a var=value assignment.
avg_length=$(awk '
  { total_length += length($0) }
  END { if (NR > 0) printf "%.0f", total_length / NR }
' <"$file") || exit 1

# An empty file has no average; stop here instead of printing a blank report.
if [ -z "$avg_length" ]; then
  echo "Error: File '$file' has no lines" >&2
  exit 1
fi

# Pass 2: find the line closest to the (rounded) average length.
# On ties the earliest line wins because only a strictly smaller
# difference replaces the current best.
awk -v target="$avg_length" '
  {
    curr_length = length($0)
    diff = curr_length - target
    if (diff < 0) diff = -diff
    if (NR == 1 || diff < min_diff) {
      min_diff = diff
      closest_line = $0
      line_num = NR
      actual_length = curr_length
    }
  }
  END {
    print "Average line length: " target
    print "Closest matching line (#" line_num "):"
    print closest_line
    print "Length: " actual_length
  }
' <"$file"
|
||||||
6
scripts/jsonl_to_json.sh
Executable file
6
scripts/jsonl_to_json.sh
Executable file
@@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/sh
#
# jsonl_to_json.sh - convert a JSON Lines file into a single JSON array.
#
# Usage: jsonl_to_json.sh input.jsonl output.json
#
# Every JSON value read from the input is collected into one array
# (jq --slurp) and written to the output file.
#
# Exit status: 0 on success; 1 on a usage error, an unreadable input,
# a missing jq binary, or a jq/parse failure.

[ "$#" -eq 2 ] || {
  echo "Usage: $0 input.jsonl output.json" >&2
  exit 1
}

input_file=$1
output_file=$2

if [ ! -r "$input_file" ]; then
  printf "Error: Input file '%s' not found or not readable\n" "$input_file" >&2
  exit 1
fi

if ! command -v jq >/dev/null 2>&1; then
  printf "Error: jq is required but was not found in PATH\n" >&2
  exit 1
fi

# Convert into a scratch file first. Redirecting jq straight into the output
# would truncate it before jq runs, which destroys the data when input and
# output are the same file and leaves a clobbered output when jq fails.
tmp_file=$(mktemp) || exit 1
trap 'rm -f -- "$tmp_file"' EXIT

if ! jq -s '.' <"$input_file" >"$tmp_file"; then
  printf "Error: Failed to convert '%s'\n" "$input_file" >&2
  exit 1
fi

# Copy (not move) so the output keeps normal redirection semantics:
# existing permissions, symlinks and targets such as /dev/stdout still work.
cat "$tmp_file" >"$output_file" || exit 1
|
||||||
23
scripts/truncate.sh
Executable file
23
scripts/truncate.sh
Executable file
@@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/sh
#
# truncate.sh - drop over-long lines from a text file.
#
# Usage:   truncate.sh input_file max_chars output_file
# Example: truncate.sh data.jsonl 10000 cleaned.jsonl
#
# Lines whose length is at most max_chars are copied to output_file in their
# original order; longer lines are removed entirely (they are not shortened).
#
# Exit status: 0 on success; 1 on a usage error, a missing input file,
# an invalid max_chars, or a read/write failure.

if [ $# -ne 3 ]; then
  printf "Usage: %s input_file max_chars output_file\n" "$0" >&2
  printf "Example: %s data.jsonl 10000 cleaned.jsonl\n" "$0" >&2
  exit 1
fi

input_file=$1
max_chars=$2
output_file=$3

# Check if input file exists
if [ ! -f "$input_file" ]; then
  printf "Error: Input file '%s' not found\n" "$input_file" >&2
  exit 1
fi

# Reject anything that is not a plain non-negative integer: awk would
# otherwise fall back to a string comparison and silently filter wrongly.
case $max_chars in
  '' | *[!0-9]*)
    printf "Error: max_chars must be a non-negative integer, got '%s'\n" "$max_chars" >&2
    exit 1
    ;;
esac

# Filter into a scratch file first so that using the same path for input and
# output does not truncate the input before awk has read it.
tmp_file=$(mktemp) || exit 1
trap 'rm -f -- "$tmp_file"' EXIT

# Process the file: keep lines of at most max_chars characters.
# Input arrives on stdin so awk never misreads the file name as an option
# or a var=value assignment.
if ! awk -v max="$max_chars" 'length($0) <= max + 0' <"$input_file" >"$tmp_file"; then
  printf "Error: Failed to filter '%s'\n" "$input_file" >&2
  exit 1
fi

# Copy (not move) so the output keeps normal redirection semantics.
if ! cat "$tmp_file" >"$output_file"; then
  printf "Error: Could not write '%s'\n" "$output_file" >&2
  exit 1
fi

printf "Processing complete. Lines longer than %s characters have been removed.\n" "$max_chars"
printf "Result saved to: %s\n" "$output_file"
|
||||||
@@ -52,7 +52,10 @@ resume_from_checkpoint: False
|
|||||||
|
|
||||||
# Dataset and Sampler
|
# Dataset and Sampler
|
||||||
dataset:
|
dataset:
|
||||||
_component_: torchtune.datasets.alpaca_cleaned_dataset
|
_component_: torchtune.datasets.instruct_dataset
|
||||||
|
source: json
|
||||||
|
data_files: data/my_data.json
|
||||||
|
split: train
|
||||||
packed: True # True increases speed
|
packed: True # True increases speed
|
||||||
seed: 42
|
seed: 42
|
||||||
shuffle: False
|
shuffle: False
|
||||||
|
|||||||
Reference in New Issue
Block a user