From 0481713eb6d0464a7d3da6b78b738b177246a74e Mon Sep 17 00:00:00 2001 From: kuwoyuki Date: Thu, 13 Feb 2025 17:25:01 +0600 Subject: [PATCH] chore: stuff --- .vscode/settings.json | 3 ++ scripts/avg_length.sh | 42 ++++++++++++++++++++++++++ scripts/jsonl_to_json.sh | 6 ++++ shit_names.sh => scripts/shit_names.sh | 0 scripts/truncate.sh | 23 ++++++++++++++ train_torchtune.yaml | 5 ++- 6 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100755 scripts/avg_length.sh create mode 100755 scripts/jsonl_to_json.sh rename shit_names.sh => scripts/shit_names.sh (100%) create mode 100755 scripts/truncate.sh diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ff5300e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.languageServer": "None" +} \ No newline at end of file diff --git a/scripts/avg_length.sh b/scripts/avg_length.sh new file mode 100755 index 0000000..2851842 --- /dev/null +++ b/scripts/avg_length.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Usage: $0 <file>" + exit 1 +fi + +file="$1" + +# Check if file exists +if [ ! 
-f "$file" ]; then + echo "Error: File '$file' does not exist" + exit 1 +fi + +# Calculate average line length and store line lengths with their content +avg_length=$(awk '{ + total_length += length($0) + count++ +} +END { + if (count > 0) printf "%.0f", total_length/count +}' "$file") + +# Find the line closest to average length +awk -v target=$avg_length ' +{ + curr_length = length($0) + diff = sqrt((curr_length - target)^2) + if (NR == 1 || diff < min_diff) { + min_diff = diff + closest_line = $0 + line_num = NR + actual_length = curr_length + } +} +END { + print "Average line length: " target + print "Closest matching line (#" line_num "):" + print closest_line + print "Length: " actual_length +}' "$file" diff --git a/scripts/jsonl_to_json.sh b/scripts/jsonl_to_json.sh new file mode 100755 index 0000000..7e38295 --- /dev/null +++ b/scripts/jsonl_to_json.sh @@ -0,0 +1,6 @@ +#!/bin/sh +[ "$#" -eq 2 ] || { + echo "Usage: $0 input.jsonl output.json" >&2 + exit 1 +} +jq -s '.' <"$1" >"$2" diff --git a/shit_names.sh b/scripts/shit_names.sh similarity index 100% rename from shit_names.sh rename to scripts/shit_names.sh diff --git a/scripts/truncate.sh b/scripts/truncate.sh new file mode 100755 index 0000000..ef3e23a --- /dev/null +++ b/scripts/truncate.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +if [ $# -ne 3 ]; then + printf "Usage: %s input_file max_chars output_file\n" "$0" + printf "Example: %s data.jsonl 10000 cleaned.jsonl\n" "$0" + exit 1 +fi + +input_file=$1 +max_chars=$2 +output_file=$3 + +# Check if input file exists +if [ ! -f "$input_file" ]; then + printf "Error: Input file '%s' not found\n" "$input_file" + exit 1 +fi + +# Process the file: keep lines shorter than max_chars +awk -v max="$max_chars" 'length($0) <= max' "$input_file" >"$output_file" + +printf "Processing complete. 
Lines longer than %s characters have been removed.\n" "$max_chars" +printf "Result saved to: %s\n" "$output_file" diff --git a/train_torchtune.yaml b/train_torchtune.yaml index e7d6103..766109d 100644 --- a/train_torchtune.yaml +++ b/train_torchtune.yaml @@ -52,7 +52,10 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: - _component_: torchtune.datasets.alpaca_cleaned_dataset + _component_: torchtune.datasets.instruct_dataset + source: json + data_files: data/my_data.json + split: train packed: True # True increases speed seed: 42 shuffle: False