# zh-en-wn-dataset/14b_qwen2.yaml
model: "mlx-community/Qwen2.5-14B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
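# (The data directory is expected to contain train.jsonl and valid.jsonl, plus
# test.jsonl if test evaluation is enabled; this follows mlx_lm's usual layout.)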
seed: 42
# Number of layers to fine-tune.
num_layers: 18
# Minibatch size.
batch_size: 4
# ~3 epochs over 28,425 examples at batch size 4: ceil((28425 * 3) / 4) = 21319.
iters: 21319
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384
# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000
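# (Checkpoints are written under adapter_path; the finished adapters can
# usually be merged back into the base weights with mlx_lm.fuse.)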
# Evaluate on the test set after training.
test: false
# Number of test set batches, -1 uses the entire test set.
test_batches: 100
# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true
# LoRA-specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These are applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05
lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  # Schedule arguments: [initial learning rate, decay steps, final learning rate].
  arguments: [5e-5, 21319, 1e-7]
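
# Typical launch command, assuming the mlx_lm LoRA entry point (mlx_lm.lora) is
# installed and accepts a YAML config file via --config:
#   mlx_lm.lora --config zh-en-wn-dataset/14b_qwen2.yaml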