From 0481713eb6d0464a7d3da6b78b738b177246a74e Mon Sep 17 00:00:00 2001 From: kuwoyuki Date: Thu, 13 Feb 2025 17:25:01 +0600 Subject: [PATCH] chore: stuff --- .vscode/settings.json | 3 ++ scripts/avg_length.sh | 42 ++++++++++++++++++++++++++ scripts/jsonl_to_json.sh | 6 ++++ shit_names.sh => scripts/shit_names.sh | 0 scripts/truncate.sh | 23 ++++++++++++++ train_torchtune.yaml | 5 ++- 6 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100755 scripts/avg_length.sh create mode 100755 scripts/jsonl_to_json.sh rename shit_names.sh => scripts/shit_names.sh (100%) create mode 100755 scripts/truncate.sh diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ff5300e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.languageServer": "None" +} \ No newline at end of file diff --git a/scripts/avg_length.sh b/scripts/avg_length.sh new file mode 100755 index 0000000..2851842 --- /dev/null +++ b/scripts/avg_length.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Usage: $0 <file>" + exit 1 +fi + +file="$1" + +# Check if file exists +if [ ! 
-f "$file" ]; then + echo "Error: File '$file' does not exist" + exit 1 +fi + +# Calculate average line length and store line lengths with their content +avg_length=$(awk '{ + total_length += length($0) + count++ +} +END { + if (count > 0) printf "%.0f", total_length/count +}' "$file") + +# Find the line closest to average length +awk -v target=$avg_length ' +{ + curr_length = length($0) + diff = sqrt((curr_length - target)^2) + if (NR == 1 || diff < min_diff) { + min_diff = diff + closest_line = $0 + line_num = NR + actual_length = curr_length + } +} +END { + print "Average line length: " target + print "Closest matching line (#" line_num "):" + print closest_line + print "Length: " actual_length +}' "$file" diff --git a/scripts/jsonl_to_json.sh b/scripts/jsonl_to_json.sh new file mode 100755 index 0000000..7e38295 --- /dev/null +++ b/scripts/jsonl_to_json.sh @@ -0,0 +1,6 @@ +#!/bin/sh +[ "$#" -eq 2 ] || { + echo "Usage: $0 input.jsonl output.json" >&2 + exit 1 +} +jq -s '.' <"$1" >"$2" diff --git a/shit_names.sh b/scripts/shit_names.sh similarity index 100% rename from shit_names.sh rename to scripts/shit_names.sh diff --git a/scripts/truncate.sh b/scripts/truncate.sh new file mode 100755 index 0000000..ef3e23a --- /dev/null +++ b/scripts/truncate.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +if [ $# -ne 3 ]; then + printf "Usage: %s input_file max_chars output_file\n" "$0" + printf "Example: %s data.jsonl 10000 cleaned.jsonl\n" "$0" + exit 1 +fi + +input_file=$1 +max_chars=$2 +output_file=$3 + +# Check if input file exists +if [ ! -f "$input_file" ]; then + printf "Error: Input file '%s' not found\n" "$input_file" + exit 1 +fi + +# Process the file: keep lines shorter than max_chars +awk -v max="$max_chars" 'length($0) <= max' "$input_file" >"$output_file" + +printf "Processing complete. 
Lines longer than %s characters have been removed.\n" "$max_chars" +printf "Result saved to: %s\n" "$output_file" diff --git a/train_torchtune.yaml b/train_torchtune.yaml index e7d6103..766109d 100644 --- a/train_torchtune.yaml +++ b/train_torchtune.yaml @@ -52,7 +52,10 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: - _component_: torchtune.datasets.alpaca_cleaned_dataset + _component_: torchtune.datasets.instruct_dataset + source: json + data_files: data/my_data.json + split: train packed: True # True increases speed seed: 42 shuffle: False