model: "mlx-community/Qwen2.5-14B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42

# Number of layers to fine-tune
num_layers: 18
# Minibatch size.
batch_size: 4
# (28425 * 3) / 4 = 21319
iters: 21319
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384

# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000

# Evaluate on the test set after training
test: false

# Number of test set batches, -1 uses the entire test set.
test_batches: 100

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true

# LoRA specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied for the last lora_layers
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj", 
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj" 
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05

lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  arguments: [5e-5, 21319, 1e-7]