chore: add 14b_qwen2.yaml (Qwen2.5-14B LoRA fine-tuning config)
14b_qwen2.yaml (new file, +61 lines)
@@ -0,0 +1,61 @@
model: "mlx-community/Qwen2.5-14B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42

# Number of layers to fine-tune
num_layers: 18

# Minibatch size.
batch_size: 4
# (28425 * 3) / 4 ≈ 21319
iters: 21319
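For reference, the arithmetic behind the iteration count above, assuming (as the comment suggests) 28425 training examples, 3 passes over the data, and the batch_size of 4 from this config; the example and epoch counts are not recorded anywhere else in the file, so treat them as assumptions. A minimal Python sketch:

import math

num_examples = 28425   # assumed dataset size, taken from the comment above
epochs = 3             # assumed number of passes over the data
batch_size = 4         # matches batch_size in this config

# One optimizer step consumes one batch, so total steps = examples * epochs / batch size.
iters = math.ceil(num_examples * epochs / batch_size)
print(iters)  # 21319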
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384

# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000

# Evaluate on the test set after training
test: false

# Number of test set batches, -1 uses the entire test set.
test_batches: 100

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true
# LoRA specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05
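For orientation, a minimal, framework-agnostic sketch of what these lora_parameters mean for one of the listed projection layers: the base weight stays frozen and a rank-16 update, scaled by scale and regularized with dropout on its input, is added on top. This is the generic LoRA formulation with NumPy standing in for MLX; the exact scaling and initialization conventions inside mlx-lm may differ.

import numpy as np

rank, scale, dropout = 16, 16.0, 0.05    # values from lora_parameters above

def lora_linear(x, W, A, B, training=False):
    # Frozen base projection plus a trainable low-rank correction B @ A.
    base = x @ W.T
    u = x
    if training and dropout > 0.0:
        mask = (np.random.rand(*u.shape) >= dropout).astype(u.dtype)
        u = u * mask / (1.0 - dropout)   # inverted dropout on the LoRA branch
    return base + scale * (u @ A.T) @ B.T

# Toy shapes for a d_in -> d_out projection (the real Qwen2.5-14B dimensions are much larger).
d_in, d_out = 64, 64
W = np.random.randn(d_out, d_in) * 0.02  # frozen pretrained weight
A = np.random.randn(rank, d_in) * 0.01   # trainable down-projection
B = np.zeros((d_out, rank))              # trainable up-projection, zero-init so the update starts at 0
y = lora_linear(np.random.randn(2, d_in), W, A, B, training=True)
print(y.shape)  # (2, 64)

With num_layers: 18 and all seven projection keys selected, the adapter trains low-rank matrices on the attention and MLP projections of the last 18 transformer blocks only; everything else stays frozen.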
lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  arguments: [5e-5, 21319, 1e-7]
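Reading the schedule block above: the learning rate warms up linearly from warmup_init (1e-7) to the peak of 5e-5 over the first 500 steps, then cosine-decays toward 1e-7 across the 21319-step horizon given in arguments. My reading is that arguments maps to [initial value, decay steps, end value]; the exact convention is set by the installed mlx-lm/mlx version and worth double-checking. A small sketch of that trajectory:

import math

peak_lr, total_steps, end_lr = 5e-5, 21319, 1e-7   # from arguments above
warmup_steps, warmup_init = 500, 1e-7              # from warmup / warmup_init above

def lr_at(step):
    # Linear warmup from warmup_init up to the peak learning rate.
    if step < warmup_steps:
        return warmup_init + (peak_lr - warmup_init) * step / warmup_steps
    # Cosine decay from the peak down to end_lr over the remaining steps.
    progress = min((step - warmup_steps) / max(total_steps - warmup_steps, 1), 1.0)
    return end_lr + 0.5 * (peak_lr - end_lr) * (1.0 + math.cos(math.pi * progress))

for s in (0, 250, 500, 5000, 21319):
    print(f"step {s:>6}: lr = {lr_at(s):.3e}")

Assuming this file targets the stock mlx-lm LoRA trainer, it would normally be consumed via something like mlx_lm.lora --config 14b_qwen2.yaml; that flag is the usual one for the CLI but is an assumption here, not something recorded in the commit.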