chore: _
3  .gitignore  vendored
@@ -7,4 +7,5 @@ datasets/
 train.en
 train.zh
 valid*.en
 valid*.zh
+token
61  14b_qwen2.yaml  Normal file
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-14B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 18
+# Minibatch size.
+batch_size: 4
+# (28425 * 3) / 4 = 21319
+iters: 21319
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 5e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training.
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA-specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied to the last num_layers layers.
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 21319, 1e-7]
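If these YAML files are consumed by mlx-lm's LoRA trainer, which their schema suggests (e.g. `python -m mlx_lm.lora -c 14b_qwen2.yaml`), the lr_schedule block describes a linear warmup into a cosine decay. A rough, self-contained sketch of that shape, for orientation only; the actual schedule comes from mlx.optimizers, and the exact handoff between warmup and decay is an assumption here:

```python
import math

# Approximate shape of the schedule declared above: linear warmup from
# warmup_init to the peak, then cosine decay from the peak down to the end LR.
# This mirrors, but does not reproduce, mlx.optimizers' implementation.
def scheduled_lr(step, peak=5e-5, decay_steps=21319, end=1e-7,
                 warmup=500, warmup_init=1e-7):
    if step < warmup:
        return warmup_init + (peak - warmup_init) * step / warmup
    t = min(step - warmup, decay_steps)
    return end + (peak - end) * 0.5 * (1 + math.cos(math.pi * t / decay_steps))

for s in (0, 250, 500, 10000, 21319):
    print(s, f"{scheduled_lr(s):.2e}")
```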
61  7b_qwen2.yaml  Normal file
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-7B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 8
+# Minibatch size.
+batch_size: 8
+# 28425 / 8 ≈ 3553 (adjusted for the larger batch size; one pass over the data)
+iters: 3553
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 3e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training.
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA-specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied to the last num_layers layers.
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 3553, 1e-7]
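Both configs derive iters from the same bookkeeping: total training examples divided by the minibatch size, scaled by how many passes over the data are intended. A minimal sketch of that arithmetic, taking the 28425-example figure from the comments above as given:

```python
# Reproduces the iters arithmetic from the config comments; 28425 is the
# example count quoted there, "passes" is the intended number of epochs.
def lora_iters(num_examples: int, passes: float, batch_size: int) -> int:
    return round(num_examples * passes / batch_size)

print(lora_iters(28425, 3, 4))  # 21319 -> iters in 14b_qwen2.yaml (three passes)
print(lora_iters(28425, 1, 8))  # 3553  -> iters in 7b_qwen2.yaml (one pass)
```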
@@ -95,13 +95,21 @@ def create_datasets(
 
 # Helper function to write datasets
 def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
+    template = "### Instruction:\nTranslate the following Chinese text to English:\n\n### Input:\n{}\n\n### Response:\n{}"
+
     with open(filepath, "w", encoding="utf-8") as f:
         for text_en, text_zh in chapters:
             processed_en = process_text(text_en)
             processed_zh = process_text(text_zh)
 
+            # entry = {
+            #     "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+            # }
+            # entry = {"text": template.format(processed_zh, processed_en)}
             entry = {
-                "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                "instruction": "Translate the following Chinese text to English:",
+                "input": processed_zh,
+                "output": processed_en,
             }
             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
 
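For reference, this is the JSONL record shape the new write_dataset emits, and how train.py (added below in this commit) renders it into an Alpaca-style training string. The sample sentence pair is made up, and process_text is assumed to have already been applied:

```python
import json

# One record as written by write_dataset (instruction/input/output fields).
record = {
    "instruction": "Translate the following Chinese text to English:",
    "input": "他推开门，走进了夜色里。",
    "output": "He pushed the door open and stepped into the night.",
}
jsonl_line = json.dumps(record, ensure_ascii=False)  # one line of the dataset file

# train.py's formatting_func fills the same Alpaca template and appends the
# tokenizer's EOS token before handing the text to SFTTrainer.
alpaca_prompt = "### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}"
print(alpaca_prompt.format(record["instruction"], record["input"], record["output"]))
```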
175  train.py  Normal file
@@ -0,0 +1,175 @@
+import argparse
+
+import torch
+from unsloth import FastLanguageModel
+from datasets import load_dataset
+from trl import SFTTrainer
+from transformers import TrainingArguments
+
+
+def load_data(path):
+    if "drive.google.com" in str(path):
+        try:
+            import gdown
+
+            local_path = "downloaded_dataset.json"
+            gdown.download(url=path, output=local_path, fuzzy=True)
+            dataset_path = local_path
+        except ImportError:
+            raise ImportError("Please install gdown: pip install gdown")
+    else:
+        dataset_path = path
+
+    dataset = load_dataset("json", data_files=dataset_path, split="train")
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base_model",
+        type=str,
+        default="unsloth/Qwen2.5-7B",
+        required=False,
+        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset to train on (local JSONL path or Google Drive link)",
+    )
+    parser.add_argument("--hf_token", type=str, required=False, help="Hugging Face token")
+
+    args = parser.parse_args()
+
+    max_seq_length = 16384  # Choose any! We auto support RoPE Scaling internally!
+    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=args.base_model,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )
+
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        lora_alpha=16,
+        lora_dropout=0,  # Supports any, but = 0 is optimized
+        bias="none",  # Supports any, but = "none" is optimized
+        use_gradient_checkpointing="unsloth",
+        random_state=3407,
+        max_seq_length=max_seq_length,
+        use_rslora=False,
+        loftq_config=None,
+    )
+
+    alpaca_prompt = """### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+    # Resolve the dataset path (handles Google Drive links) and load it.
+    DATASET_PATH = args.dataset
+    dataset = load_data(DATASET_PATH)
+
+    EOS_TOKEN = tokenizer.eos_token
+    print(f"EOS Token: {EOS_TOKEN}")
+
+    def formatting_func(example):
+        instructions = example["instruction"]
+        inputs = example["input"]
+        outputs = example["output"]
+        texts = []
+        for instruction, input_text, output in zip(instructions, inputs, outputs):
+            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
+            texts.append(text)
+
+        return {
+            "text": texts,
+        }
+
+    dataset = dataset.map(formatting_func, batched=True)
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        packing=False,
+        args=TrainingArguments(
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=4,
+            warmup_ratio=0.05,
+            max_grad_norm=1.0,
+            num_train_epochs=1,
+            learning_rate=2e-5,
+            fp16=not torch.cuda.is_bf16_supported(),
+            bf16=torch.cuda.is_bf16_supported(),
+            logging_steps=10,
+            optim="adamw_8bit",
+            weight_decay=0.1,
+            lr_scheduler_type="linear",
+            seed=3407,
+            output_dir="/output/",
+            report_to=None,
+        ),
+    )
+
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")
+
+    trainer_stats = trainer.train()
+
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+
+    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+    print(
+        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
+    )
+    print(f"Peak reserved memory = {used_memory} GB.")
+    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+    try:
+        if args.hf_token:
+            model.push_to_hub_gguf(
+                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
+                tokenizer,
+                quantization_method="q4_k_m",
+                token=args.hf_token,
+            )
+        model.save_pretrained_gguf(
+            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+
+    print("✅ Done.")
+
+
+if __name__ == "__main__":
+    main()
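A few numbers implied by the TrainingArguments above, for orientation. This assumes the script is invoked as something like `python train.py --dataset train.jsonl --hf_token <token>` and trains on the same roughly 28425-example dataset referenced in the YAML config comments, which is an assumption rather than something the script enforces:

```python
# Back-of-the-envelope step count for the Unsloth run configured above.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 1
num_examples = 28425  # assumed, carried over from the YAML config comments

effective_batch = per_device_train_batch_size * gradient_accumulation_steps
optimizer_steps = num_examples * num_train_epochs // effective_batch
warmup_steps = round(0.05 * optimizer_steps)  # warmup_ratio=0.05

print(effective_batch)   # 8
print(optimizer_steps)   # ~3553, in line with the 7B MLX config
print(warmup_steps)      # ~178
```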