model: "mlx-community/Qwen2.5-14B-bf16" train: true fine_tune_type: "lora" data: "/path/to/training/data" seed: 42 # Number of layers to fine-tune num_layers: 18 # Minibatch size. batch_size: 4 # (28425 * 3) / 4 = 21319 iters: 21319 # Number of validation batches, -1 uses the entire validation set. val_batches: 50 # Adam learning rate. learning_rate: 5e-5 # Number of training steps between loss reporting. steps_per_report: 10 # Number of training steps between validations. steps_per_eval: 1000 # Maximum sequence length. max_seq_length: 16384 # Load path to resume training with the given adapter weights. resume_adapter_file: null # Save/load path for the trained adapter weights. adapter_path: "adapters" # Save the model every N iterations. save_every: 1000 # Evaluate on the test set after training test: false # Number of test set batches, -1 uses the entire test set. test_batches: 100 # Use gradient checkpointing to reduce memory use. grad_checkpoint: true # LoRA specific settings lora_parameters: # The layer keys to apply LoRA to. # These will be applied for the last lora_layers keys: [ "self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj", "mlp.up_proj", "mlp.down_proj", "mlp.gate_proj" ] rank: 16 scale: 16.0 dropout: 0.05 lr_schedule: name: cosine_decay warmup: 500 warmup_init: 1e-7 arguments: [5e-5, 21319, 1e-7]