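# LoRA fine-tuning configuration for Qwen2.5-14B, in the style of mlx_lm.
# A sketch of a typical invocation, assuming a recent mlx_lm install
# (the config filename here is illustrative):
#   mlx_lm.lora --config lora_config.yaml
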
# The path to the local model directory or Hugging Face repo.
model: "mlx-community/Qwen2.5-14B-bf16"
# Whether or not to train (boolean).
train: true
# The fine-tuning method: "lora", "dora", or "full".
fine_tune_type: "lora"
# Directory with the {train, valid, test}.jsonl files.
data: "/path/to/training/data"
# The PRNG seed.
seed: 42

# Number of layers to fine-tune.
num_layers: 18
# Minibatch size.
batch_size: 4
# Training iterations: roughly 3 passes over 28425 examples at batch size 4,
# (28425 * 3) / 4 ≈ 21319.
iters: 21319
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384

# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000

# Evaluate on the test set after training.
test: false

# Number of test set batches, -1 uses the entire test set.
test_batches: 100

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true

# LoRA-specific settings.
lora_parameters:
  # The layer keys to apply LoRA to.
  # These are applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05

# Learning-rate schedule.
lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  arguments: [5e-5, 21319, 1e-7]
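
# Note on lr_schedule (assuming the mlx_lm trainer): `name` selects a scheduler from
# mlx.optimizers, `arguments` is passed to it positionally (for cosine_decay:
# [initial learning rate, decay steps, end learning rate]), and training warms up
# linearly from `warmup_init` to the initial rate over the first `warmup` steps.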