chore: add data-prep helper scripts and switch training dataset to instruct_dataset

2025-02-13 17:25:01 +06:00
parent 66e61c9aad
commit 0481713eb6
6 changed files with 78 additions and 1 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
 {
    "python.languageServer": "None"
 }
--- a/scripts/avg_length.sh
+++ b/scripts/avg_length.sh
@@ -0,0 +1,42 @@
 #!/bin/bash
 # Report the average line length of a file and the line closest to it.
 #
 # Usage: avg_length.sh <filename>
 #
 # Prints the average line length (rounded to a whole number by printf
 # "%.0f"), followed by the number, content and length of the first line
 # whose length is closest to that rounded average.
 # Diagnostics go to stderr. Exits 1 on a usage error, when the argument is
 # not a regular file, or when the file contains no lines.
 if [ $# -ne 1 ]; then
     echo "Usage: $0 <filename>" >&2
     exit 1
 fi
 file="$1"
 # Only an existing regular file is accepted.
 if [ ! -f "$file" ]; then
     echo "Error: File '$file' does not exist" >&2
     exit 1
 fi
 # First pass: compute the rounded average line length.
 # awk prints nothing for a file with no lines, leaving avg_length empty.
 avg_length=$(awk '
 { total_length += length($0); count++ }
 END { if (count > 0) printf "%.0f", total_length / count }
 ' "$file") || exit 1
 # Without any lines there is no average and no closest line to report.
 if [ -z "$avg_length" ]; then
     echo "Error: File '$file' is empty" >&2
     exit 1
 fi
 # Second pass: find the line whose length is closest to the average.
 # The strict "<" keeps the earliest line when several are equally close.
 awk -v target="$avg_length" '
 {
     curr_length = length($0)
     # Absolute distance between this line length and the target.
     diff = curr_length - target
     if (diff < 0) diff = -diff
     if (NR == 1 || diff < min_diff) {
         min_diff = diff
         closest_line = $0
         line_num = NR
         actual_length = curr_length
     }
 }
 END {
     print "Average line length: " target
     print "Closest matching line (#" line_num "):"
     print closest_line
     print "Length: " actual_length
 }' "$file"
--- a/scripts/jsonl_to_json.sh
+++ b/scripts/jsonl_to_json.sh
@@ -0,0 +1,6 @@
 #!/bin/sh
 # Convert a JSON Lines file into a single JSON array.
 #
 # Usage: jsonl_to_json.sh input.jsonl output.json
 #
 # Requires jq. Exits 1 with a usage message on stderr unless exactly two
 # arguments are given; otherwise the exit status is that of jq (or of the
 # shell if a redirection fails).
 if [ "$#" -ne 2 ]; then
     echo "Usage: $0 input.jsonl output.json" >&2
     exit 1
 fi
 src=$1
 dest=$2
 # --slurp reads every JSON value from the input into one array.
 jq --slurp '.' <"$src" >"$dest"
--- a/scripts/shit_names.sh
+++ b/scripts/shit_names.sh
--- a/scripts/truncate.sh
+++ b/scripts/truncate.sh
@@ -0,0 +1,23 @@
 #!/bin/sh
 # Copy input_file to output_file, dropping every line longer than max_chars.
 #
 # Usage: truncate.sh input_file max_chars output_file
 #
 # Lines whose length is less than or equal to max_chars are kept unchanged;
 # longer lines are removed entirely (they are not shortened).
 # Diagnostics go to stderr. Exits 1 on a usage error, a missing input file,
 # a non-numeric max_chars, identical input/output files, or a failed write.
 if [ $# -ne 3 ]; then
     printf "Usage: %s input_file max_chars output_file\n" "$0" >&2
     printf "Example: %s data.jsonl 10000 cleaned.jsonl\n" "$0" >&2
     exit 1
 fi
 input_file=$1
 max_chars=$2
 output_file=$3
 # Check if input file exists
 if [ ! -f "$input_file" ]; then
     printf "Error: Input file '%s' not found\n" "$input_file" >&2
     exit 1
 fi
 # A non-numeric limit would make awk compare as strings and silently keep
 # or drop the wrong lines, so accept only a non-negative integer.
 case $max_chars in
     '' | *[!0-9]*)
         printf "Error: max_chars must be a non-negative integer, got '%s'\n" "$max_chars" >&2
         exit 1
         ;;
 esac
 # The output redirection truncates its target before awk reads the input,
 # so writing back to the same file would destroy the data.
 if [ "$input_file" -ef "$output_file" ]; then
     printf "Error: Input and output must be different files\n" >&2
     exit 1
 fi
 # Process the file: keep lines of at most max_chars characters.
 # "max + 0" forces a numeric comparison.
 if ! awk -v max="$max_chars" 'length($0) <= max + 0' "$input_file" >"$output_file"; then
     printf "Error: Failed to write '%s'\n" "$output_file" >&2
     exit 1
 fi
 printf "Processing complete. Lines longer than %s characters have been removed.\n" "$max_chars"
 printf "Result saved to: %s\n" "$output_file"
--- a/train_torchtune.yaml
+++ b/train_torchtune.yaml
@@ -52,7 +52,10 @@ resume_from_checkpoint: False
 # Dataset and Sampler
 dataset:
-  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  _component_: torchtune.datasets.instruct_dataset
  source: json
  data_files: data/my_data.json
  split: train
  packed: True # True increases speed
 seed: 42
 shuffle: False