2025-02-15 05:41:51 +06:00
parent 9af06f6382
commit 281f54df25
5 changed files with 28 additions and 22 deletions


@@ -2,6 +2,9 @@
## Installation
Just run `chmod +x setup.sh && ./setup.sh` on the pod.
Remember to `export HF_HOME=/workspace/hf` (so the Hugging Face cache lands on the PVC mounted in the pod).
1. Clone the repository:
```bash


@@ -9,7 +9,7 @@ class DataConfig:
Translation:
{}"""
train_split: float = 0.95
max_samples: int | None = 5000
max_samples: int | None = None
@dataclass
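Setting `max_samples` back to `None` presumably removes the 5,000-sample cap so training runs over the full dataset. A minimal sketch of how such a flag is commonly consumed with Hugging Face `datasets` (the file path and loader below are placeholders, not this repo's actual data code):

```python
from datasets import load_dataset

def load_training_data(config: DataConfig):
    # Placeholder source; the real loader and path may differ.
    ds = load_dataset("json", data_files="data/train.jsonl", split="train")
    # max_samples=None -> keep everything; an int -> truncate for quick experiments.
    if config.max_samples is not None:
        ds = ds.select(range(min(config.max_samples, len(ds))))
    # train_split=0.95 leaves 5% held out for evaluation.
    return ds.train_test_split(test_size=1.0 - config.train_split, seed=42)
```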
@@ -31,11 +31,11 @@ class TrainingConfig:
base_model: str = "unsloth/Qwen2.5-7B"
max_seq_length: int = 6144
dtype: str | None = None
load_in_4bit: bool = True
load_in_4bit: bool = False
# LoRA
lora_r: int = 64
lora_alpha: int = 128
lora_r: int = 16
lora_alpha: int = 32
lora_dropout: float = 0
target_modules: list[str] = field(
default_factory=lambda: [
@@ -49,18 +49,19 @@ class TrainingConfig:
]
)
use_gradient_checkpointing: str = "unsloth"
random_state: int = 3407
random_state: int = 42
use_rslora: bool = False
loftq_config: dict | None = None
# training args
per_device_train_batch_size: int = 16
gradient_accumulation_steps: int = 2
warmup_ratio: float = 0.1
gradient_accumulation_steps: int = 4
# warmup_ratio: float = 0.1
warmup_steps: int = 80
max_grad_norm: float = 1.0
num_train_epochs: float = 1
learning_rate: float = 5e-4
weight_decay: float = 0.01
num_train_epochs: float = 3
learning_rate: float = 1e-5
weight_decay: float = 0.001
lr_scheduler_type: str = "cosine"
logging_steps: int = 1
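The model and LoRA fields above map onto unsloth's usual two-step setup. A sketch of that wiring under the new defaults (no 4-bit quantization, r=16 / alpha=32), assuming the repo follows the standard `FastLanguageModel` pattern rather than anything bespoke:

```python
from unsloth import FastLanguageModel

def build_model(cfg: TrainingConfig):
    # Load the base model; with load_in_4bit=False this is a 16-bit load.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=cfg.base_model,
        max_seq_length=cfg.max_seq_length,
        dtype=cfg.dtype,
        load_in_4bit=cfg.load_in_4bit,
    )
    # Attach LoRA adapters with the smaller r=16 / alpha=32 settings.
    model = FastLanguageModel.get_peft_model(
        model,
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
        lora_dropout=cfg.lora_dropout,
        target_modules=cfg.target_modules,
        use_gradient_checkpointing=cfg.use_gradient_checkpointing,
        random_state=cfg.random_state,
        use_rslora=cfg.use_rslora,
        loftq_config=cfg.loftq_config,
    )
    return model, tokenizer
```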
@@ -70,15 +71,15 @@ class TrainingConfig:
save_total_limit: int | None = 3
# dataset
dataset_num_proc: int = 4
dataset_num_proc: int = 8
packing: bool = True
# eval
fp16_full_eval: bool = True
per_device_eval_batch_size: int = 8
eval_accumulation_steps: int = 2
per_device_eval_batch_size: int = 16
eval_accumulation_steps: int = 1
eval_strategy: str = "steps"
eval_steps: int = 10
eval_steps: int = 100
# output
output_dir: str = "/workspace/output/"
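One consequence of the batching change worth spelling out: the effective train batch per optimizer step doubles from 16 × 2 = 32 to 16 × 4 = 64 (per GPU), and per-device eval batches double from 8 to 16. A quick sanity check, assuming the dataclass above is importable:

```python
cfg = TrainingConfig()  # the dataclass defined in the config file above
effective_batch = cfg.per_device_train_batch_size * cfg.gradient_accumulation_steps
assert effective_batch == 64  # 16 per device x 4 accumulation steps, per GPU
```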

main.py

@@ -86,14 +86,16 @@ def run_sweep(base_config: TrainingConfig, dataset_path: str):
"parameters": {
"learning_rate": {
"distribution": "log_uniform_values",
"min": 1e-5,
"max": 1e-4,
"min": 1e-7,
"max": 1e-5,
},
"lora_r": {"values": [8]},
"lora_alpha": {"values": [16]},
"per_device_train_batch_size": {"values": [16]},
"gradient_accumulation_steps": {"values": [2]},
"lora_r": {"values": [32]},
"lora_alpha": {"values": [64]},
"per_device_train_batch_size": {"values": [32]},
"gradient_accumulation_steps": {"values": [4, 8]},
"num_train_epochs": {"values": [1]},
"warmup_steps": {"values": [10]},
"max_grad_norm": {"values": [0.1, 0.3, 0.5]},
},
"early_terminate": {"type": "hyperband", "min_iter": 100},
}
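The dict above is a standard W&B sweep configuration; it is normally registered and executed roughly as below (the project name and train function are placeholders, and `run_sweep` may wire this differently):

```python
import wandb

# learning_rate is sampled log-uniformly in [1e-7, 1e-5], the remaining
# parameters take the listed values, and Hyperband early termination only
# starts pruning runs after 100 iterations.
sweep_id = wandb.sweep(sweep_config, project="qwen-translation")  # placeholder project name
wandb.agent(sweep_id, function=train_one_run, count=20)  # train_one_run is hypothetical
```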


@@ -1,7 +1,6 @@
#!/bin/sh
set -eu
set -o pipefail
# constants
WORKSPACE_DIR="/workspace"
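Dropping `set -o pipefail` matches the `#!/bin/sh` shebang: `pipefail` is a bash/zsh extension that many `/bin/sh` implementations (dash, for example) reject, so the script would have died on that line, while `set -eu` is plain POSIX and stays.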


@@ -55,7 +55,8 @@ class CustomTrainer:
output_dir=self.config.output_dir,
per_device_train_batch_size=self.config.per_device_train_batch_size,
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
warmup_ratio=self.config.warmup_ratio,
# warmup_ratio=self.config.warmup_ratio,
warmup_steps=self.config.warmup_steps,
max_grad_norm=self.config.max_grad_norm,
num_train_epochs=self.config.num_train_epochs,
learning_rate=self.config.learning_rate,
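These kwargs feed the trainer's argument object. Assuming `CustomTrainer` follows the common unsloth + TRL pattern (older TRL API, where `packing` and `max_seq_length` go to `SFTTrainer` directly rather than via `SFTConfig`), the rest of the wiring would look roughly like this sketch, with `model`, `tokenizer`, and the datasets already in scope:

```python
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,                            # PEFT model from FastLanguageModel.get_peft_model
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_seq_length=self.config.max_seq_length,
    packing=self.config.packing,            # pack short samples into 6144-token sequences
    args=training_args,                     # the TrainingArguments built above
)
trainer.train()
```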