chore: add 14b_qwen2.yaml (Qwen2.5-14B LoRA fine-tuning config)
14b_qwen2.yaml (new file, +61 lines)
@@ -0,0 +1,61 @@
model: "mlx-community/Qwen2.5-14B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42

# Number of layers to fine-tune
num_layers: 18

# Minibatch size.
batch_size: 4
# (28425 * 3) / 4 ≈ 21319
iters: 21319
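For reference, the arithmetic behind the iteration count above, assuming (as the comment suggests) 28425 training examples, 3 passes over the data, and the batch_size of 4 from this config; the example and epoch counts are not recorded anywhere else in the file, so treat them as assumptions. A minimal Python sketch:

import math

num_examples = 28425   # assumed dataset size, taken from the comment above
epochs = 3             # assumed number of passes over the data
batch_size = 4         # matches batch_size in this config

# One optimizer step consumes one batch, so total steps = examples * epochs / batch size.
iters = math.ceil(num_examples * epochs / batch_size)
print(iters)  # 21319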
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384

# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000

# Evaluate on the test set after training
test: false

# Number of test set batches, -1 uses the entire test set.
test_batches: 100

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true
# LoRA specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05
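For orientation, a minimal, framework-agnostic sketch of what these lora_parameters mean for one of the listed projection layers: the base weight stays frozen and a rank-16 update, scaled by scale and regularized with dropout on its input, is added on top. This is the generic LoRA formulation with NumPy standing in for MLX; the exact scaling and initialization conventions inside mlx-lm may differ.

import numpy as np

rank, scale, dropout = 16, 16.0, 0.05    # values from lora_parameters above

def lora_linear(x, W, A, B, training=False):
    # Frozen base projection plus a trainable low-rank correction B @ A.
    base = x @ W.T
    u = x
    if training and dropout > 0.0:
        mask = (np.random.rand(*u.shape) >= dropout).astype(u.dtype)
        u = u * mask / (1.0 - dropout)   # inverted dropout on the LoRA branch
    return base + scale * (u @ A.T) @ B.T

# Toy shapes for a d_in -> d_out projection (the real Qwen2.5-14B dimensions are much larger).
d_in, d_out = 64, 64
W = np.random.randn(d_out, d_in) * 0.02  # frozen pretrained weight
A = np.random.randn(rank, d_in) * 0.01   # trainable down-projection
B = np.zeros((d_out, rank))              # trainable up-projection, zero-init so the update starts at 0
y = lora_linear(np.random.randn(2, d_in), W, A, B, training=True)
print(y.shape)  # (2, 64)

With num_layers: 18 and all seven projection keys selected, the adapter trains low-rank matrices on the attention and MLP projections of the last 18 transformer blocks only; everything else stays frozen.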
lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  arguments: [5e-5, 21319, 1e-7]
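Reading the schedule block above: the learning rate warms up linearly from warmup_init (1e-7) to the peak of 5e-5 over the first 500 steps, then cosine-decays toward 1e-7 across the 21319-step horizon given in arguments. My reading is that arguments maps to [initial value, decay steps, end value]; the exact convention is set by the installed mlx-lm/mlx version and worth double-checking. A small sketch of that trajectory:

import math

peak_lr, total_steps, end_lr = 5e-5, 21319, 1e-7   # from arguments above
warmup_steps, warmup_init = 500, 1e-7              # from warmup / warmup_init above

def lr_at(step):
    # Linear warmup from warmup_init up to the peak learning rate.
    if step < warmup_steps:
        return warmup_init + (peak_lr - warmup_init) * step / warmup_steps
    # Cosine decay from the peak down to end_lr over the remaining steps.
    progress = min((step - warmup_steps) / max(total_steps - warmup_steps, 1), 1.0)
    return end_lr + 0.5 * (peak_lr - end_lr) * (1.0 + math.cos(math.pi * progress))

for s in (0, 250, 500, 5000, 21319):
    print(f"step {s:>6}: lr = {lr_at(s):.3e}")

Assuming this file targets the stock mlx-lm LoRA trainer, it would normally be consumed via something like mlx_lm.lora --config 14b_qwen2.yaml; that flag is the usual one for the CLI but is an assumption here, not something recorded in the commit.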