commit c7609f8328
parent befdc9c945
2025-02-11 20:30:49 +06:00
5 changed files with 308 additions and 2 deletions

.gitignore (vendored): 1 line changed

@@ -8,3 +8,4 @@ train.en
 train.zh
 valid*.en
 valid*.zh
+token

14b_qwen2.yaml (new file, 61 lines)

@@ -0,0 +1,61 @@
model: "mlx-community/Qwen2.5-14B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42
# Number of layers to fine-tune
num_layers: 18
# Minibatch size.
batch_size: 4
# ~3 epochs over 28425 examples at batch size 4: (28425 * 3) / 4 ≈ 21319
iters: 21319
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384
# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000
# Evaluate on the test set after training
test: false
# Number of test set batches, -1 uses the entire test set.
test_batches: 100
# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true
# LoRA specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05
lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  # [initial learning rate, decay steps, final learning rate]
  arguments: [5e-5, 21319, 1e-7]
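For reference, the lr_schedule block above describes a linear warmup from warmup_init to the peak rate over 500 steps, followed by a cosine decay down to the final rate, with arguments read as [initial learning rate, decay steps, final learning rate]. The sketch below is a minimal stand-alone approximation of that curve, not mlx-lm's actual scheduler code:

import math

def lr_at(step, peak=5e-5, decay_steps=21319, end=1e-7, warmup=500, warmup_init=1e-7):
    # Linear warmup from warmup_init up to the peak learning rate.
    if step < warmup:
        return warmup_init + (peak - warmup_init) * step / warmup
    # Cosine decay from the peak down to `end` over decay_steps.
    t = min(step - warmup, decay_steps) / decay_steps
    return end + 0.5 * (peak - end) * (1 + math.cos(math.pi * t))

for s in (0, 500, 10000, 21319):
    print(s, lr_at(s))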

7b_qwen2.yaml (new file, 61 lines)

@@ -0,0 +1,61 @@
model: "mlx-community/Qwen2.5-7B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42
# Number of layers to fine-tune
num_layers: 8
# Minibatch size.
batch_size: 8
# One epoch over 28425 examples at batch size 8: 28425 / 8 ≈ 3553
iters: 3553
# Number of validation batches, -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 3e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384
# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000
# Evaluate on the test set after training
test: false
# Number of test set batches, -1 uses the entire test set.
test_batches: 100
# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true
# LoRA specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These will be applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05
lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  # [initial learning rate, decay steps, final learning rate]
  arguments: [5e-5, 3553, 1e-7]
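Both YAML files are mlx-lm LoRA configs; training would typically be launched with something like "python -m mlx_lm.lora --config 7b_qwen2.yaml", and the resulting adapters applied at load time. The snippet below is a hedged sketch of that inference step, assuming mlx-lm's load/generate helpers and the adapter_path set above; adjust to the installed mlx-lm version:

from mlx_lm import load, generate

# Assumes the adapters were written to the adapter_path set in the config.
model, tokenizer = load("mlx-community/Qwen2.5-7B-bf16", adapter_path="adapters")

# Prompt laid out in the same alpaca-style format the new dataset records encode.
prompt = (
    "### Instruction:\nTranslate the following Chinese text to English:\n\n"
    "### Input:\n这是一个例子。\n\n### Response:\n"
)
print(generate(model, tokenizer, prompt=prompt, max_tokens=256))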


@@ -95,13 +95,21 @@ def create_datasets(
     # Helper function to write datasets
     def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
+        template = "### Instruction:\nTranslate the following Chinese text to English:\n\n### Input:\n{}\n\n### Response:\n{}"
         with open(filepath, "w", encoding="utf-8") as f:
             for text_en, text_zh in chapters:
                 processed_en = process_text(text_en)
                 processed_zh = process_text(text_zh)
+                # entry = {
+                #     "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                # }
+                # entry = {"text": template.format(processed_zh, processed_en)}
                 entry = {
-                    "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                    "instruction": "Translate the following Chinese text to English:",
+                    "input": processed_zh,
+                    "output": processed_en,
                 }
                 f.write(json.dumps(entry, ensure_ascii=False) + "\n")
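With this change each JSONL line carries separate instruction/input/output fields instead of a single pre-rendered chat string; train.py below re-assembles them into an alpaca-style prompt. A hypothetical record (not taken from the actual dataset) and the text it becomes:

import json

record = {
    "instruction": "Translate the following Chinese text to English:",
    "input": "这是一个例子。",
    "output": "This is an example.",
}
print(json.dumps(record, ensure_ascii=False))  # one line of the JSONL file

alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""
# train.py appends the tokenizer's EOS token after the response.
print(alpaca_prompt.format(record["instruction"], record["input"], record["output"]))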

train.py (new file, 175 lines)

@@ -0,0 +1,175 @@
import argparse

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments


def load_data(path):
    # Google Drive links are fetched locally with gdown before loading.
    if "drive.google.com" in str(path):
        try:
            import gdown

            local_path = "downloaded_dataset.json"
            gdown.download(url=path, output=local_path, fuzzy=True)
            dataset_path = local_path
        except ImportError:
            raise ImportError("Please install gdown: pip install gdown")
    else:
        dataset_path = path
    dataset = load_dataset("json", data_files=dataset_path, split="train")
    return dataset
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model",
        type=str,
        default="unsloth/Qwen2.5-7B",
        required=False,
        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset to train on (local JSON/JSONL path or Google Drive link)",
    )
    parser.add_argument(
        "--hub_token",
        type=str,
        required=False,
        help="Hugging Face token used to push the GGUF export",
    )
    args = parser.parse_args()
    max_seq_length = 16384  # Choose any! We auto support RoPE Scaling internally!
    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,  # Supports any, but = 0 is optimized
        bias="none",  # Supports any, but = "none" is optimized
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        max_seq_length=max_seq_length,
        use_rslora=False,
        loftq_config=None,
    )
    alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

    # Expects JSONL records with "instruction", "input" and "output" fields,
    # matching the format emitted by the dataset preparation script.
    dataset = load_data(args.dataset)

    EOS_TOKEN = tokenizer.eos_token
    print(f"EOS Token: {EOS_TOKEN}")

    def formatting_func(example):
        instructions = example["instruction"]
        inputs = example["input"]
        outputs = example["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            # EOS is appended so the model learns to stop after the translation.
            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
            texts.append(text)
        return {
            "text": texts,
        }

    dataset = dataset.map(formatting_func, batched=True)
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            num_train_epochs=1,
            learning_rate=2e-5,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=10,
            optim="adamw_8bit",
            weight_decay=0.1,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="/output/",
            report_to=None,
        ),
    )
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(
        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
    )
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
    try:
        if args.hub_token:
            model.push_to_hub_gguf(
                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
                tokenizer,
                quantization_method="q4_k_m",
                token=args.hub_token,
            )
        model.save_pretrained_gguf(
            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
        )
    except Exception as e:
        print(f"Error: {e}")
    print("✅ Done.")


if __name__ == "__main__":
    main()
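A rough usage sketch for the script above; the dataset size is assumed from the YAML comments (about 28,425 examples), not read from the data:

# python train.py --dataset /path/to/train.jsonl --hub_token hf_xxx
#
# Effective optimizer batch and step count for one epoch under the TrainingArguments above:
dataset_size = 28425                     # assumed from the YAML comments
effective_batch = 2 * 4                  # per_device_train_batch_size * gradient_accumulation_steps
steps_per_epoch = round(dataset_size / effective_batch)
print(effective_batch, steps_per_epoch)  # 8, ~3553 steps for num_train_epochs=1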