chore: _
.gitignore (vendored)
@@ -7,4 +7,5 @@ datasets/
train.en
train.zh
valid*.en
valid*.zh
token
14b_qwen2.yaml (new file, 61 lines)
@@ -0,0 +1,61 @@
model: "mlx-community/Qwen2.5-14B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42

# Number of layers to fine-tune.
num_layers: 18
# Minibatch size.
batch_size: 4
# 3 epochs over 28425 samples at batch size 4: (28425 * 3) / 4 ≈ 21319.
iters: 21319
# Number of validation batches; -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 5e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384

# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000

# Evaluate on the test set after training.
test: false

# Number of test set batches; -1 uses the entire test set.
test_batches: 100

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true

# LoRA-specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These are applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05

lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  arguments: [5e-5, 21319, 1e-7]
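For reference, the lr_schedule block above describes a 500-step linear warmup from 1e-7 up to the peak rate, followed by cosine decay over 21319 steps back down to 1e-7. Below is a minimal Python sketch of that shape using mlx.optimizers schedules; how mlx_lm itself wires warmup into the schedule (and the exact cosine_decay signature) are assumptions here, not code from this commit.

import mlx.optimizers as optim

# Values copied from the config above.
peak_lr, decay_steps, end_lr = 5e-5, 21319, 1e-7   # lr_schedule.arguments
warmup_steps, warmup_init = 500, 1e-7              # warmup / warmup_init

# Linear ramp for the first 500 steps, then cosine decay for the remainder.
warmup = optim.linear_schedule(warmup_init, peak_lr, warmup_steps)
decay = optim.cosine_decay(peak_lr, decay_steps, end_lr)
schedule = optim.join_schedules([warmup, decay], [warmup_steps])

# MLX optimizers accept a schedule callable as the learning rate.
optimizer = optim.Adam(learning_rate=schedule)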
7b_qwen2.yaml (new file, 61 lines)
@@ -0,0 +1,61 @@
model: "mlx-community/Qwen2.5-7B-bf16"
train: true
fine_tune_type: "lora"
data: "/path/to/training/data"
seed: 42

# Number of layers to fine-tune.
num_layers: 8
# Minibatch size.
batch_size: 8
# One epoch over 28425 samples at batch size 8: 28425 / 8 ≈ 3553
# (three epochs would be (28425 * 3) / 8 ≈ 10660).
iters: 3553
# Number of validation batches; -1 uses the entire validation set.
val_batches: 50
# Adam learning rate.
learning_rate: 3e-5
# Number of training steps between loss reporting.
steps_per_report: 10
# Number of training steps between validations.
steps_per_eval: 1000
# Maximum sequence length.
max_seq_length: 16384

# Load path to resume training with the given adapter weights.
resume_adapter_file: null
# Save/load path for the trained adapter weights.
adapter_path: "adapters"
# Save the model every N iterations.
save_every: 1000

# Evaluate on the test set after training.
test: false

# Number of test set batches; -1 uses the entire test set.
test_batches: 100

# Use gradient checkpointing to reduce memory use.
grad_checkpoint: true

# LoRA-specific settings
lora_parameters:
  # The layer keys to apply LoRA to.
  # These are applied to the last num_layers layers.
  keys: [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.up_proj",
    "mlp.down_proj",
    "mlp.gate_proj"
  ]
  rank: 16
  scale: 16.0
  dropout: 0.05

lr_schedule:
  name: cosine_decay
  warmup: 500
  warmup_init: 1e-7
  arguments: [5e-5, 3553, 1e-7]
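The iters values in both configs come from the same dataset-size arithmetic; a tiny hypothetical helper (illustrative, not part of the commit; the 28425 sample count is taken from the comments above) makes it explicit:

def lora_iters(num_samples: int, batch_size: int, epochs: float = 1.0) -> int:
    """Approximate optimizer steps needed to cover the dataset `epochs` times."""
    return round(num_samples * epochs / batch_size)

print(lora_iters(28425, 4, epochs=3))  # 21319 -> 14b_qwen2.yaml
print(lora_iters(28425, 8, epochs=1))  # 3553  -> 7b_qwen2.yaml (about one epoch)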
@@ -95,13 +95,21 @@ def create_datasets(

# Helper function to write datasets
def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
    template = "### Instruction:\nTranslate the following Chinese text to English:\n\n### Input:\n{}\n\n### Response:\n{}"

    with open(filepath, "w", encoding="utf-8") as f:
        for text_en, text_zh in chapters:
            processed_en = process_text(text_en)
            processed_zh = process_text(text_zh)

            # entry = {
            #     "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
            # }
            # entry = {"text": template.format(processed_zh, processed_en)}
            entry = {
                "instruction": "Translate the following Chinese text to English:",
                "input": processed_zh,
                "output": processed_en,
            }
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
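With this change, each JSONL record carries explicit instruction/input/output fields instead of a single pre-rendered text field; train.py's formatting_func below consumes exactly these keys. A standalone sketch of one record (the sentences are made-up placeholders):

import json

# Hypothetical record in the format write_dataset now emits.
entry = {
    "instruction": "Translate the following Chinese text to English:",
    "input": "他打开了门。",
    "output": "He opened the door.",
}
print(json.dumps(entry, ensure_ascii=False))
# -> {"instruction": "Translate the following Chinese text to English:", "input": "他打开了门。", "output": "He opened the door."}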
train.py (new file, 175 lines)
@@ -0,0 +1,175 @@
import argparse

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments


def load_data(path):
    if "drive.google.com" in str(path):
        try:
            import gdown

            local_path = "downloaded_dataset.json"
            gdown.download(url=path, output=local_path, fuzzy=True)
            dataset_path = local_path
        except ImportError:
            raise ImportError("Please install gdown: pip install gdown")
    else:
        dataset_path = path

    dataset = load_dataset("json", data_files=dataset_path, split="train")
    return dataset


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model",
        type=str,
        default="unsloth/Qwen2.5-7B",
        required=False,
        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset to train on",
    )
    parser.add_argument("--hub_token", type=str, required=False, help="Hugging Face token")

    args = parser.parse_args()

    max_seq_length = 16384  # Any value works; Unsloth handles RoPE scaling internally.
    dtype = None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+.
    load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Any number > 0; suggested values are 8, 16, 32, 64, 128.
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,  # Any value is supported, but 0 is optimized.
        bias="none",  # Any value is supported, but "none" is optimized.
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        max_seq_length=max_seq_length,
        use_rslora=False,
        loftq_config=None,
    )

    alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

    DATASET_PATH = args.dataset
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

    EOS_TOKEN = tokenizer.eos_token
    print(f"EOS Token: {EOS_TOKEN}")

    def formatting_func(example):
        instructions = example["instruction"]
        inputs = example["input"]
        outputs = example["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
            texts.append(text)

        return {
            "text": texts,
        }

    dataset = dataset.map(formatting_func, batched=True)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            num_train_epochs=1,
            learning_rate=2e-5,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=10,
            optim="adamw_8bit",
            weight_decay=0.1,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="/output/",
            report_to="none",  # disable wandb/tensorboard reporting
        ),
    )

    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(
        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
    )
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    try:
        if args.hub_token:
            model.push_to_hub_gguf(
                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
                tokenizer,
                quantization_method="q4_k_m",
                token=args.hub_token,
            )
        model.save_pretrained_gguf(
            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
        )
    except Exception as e:
        print(f"Error: {e}")

    print("✅ Done.")


if __name__ == "__main__":
    main()
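To preview what the trainer actually sees, here is an illustrative reconstruction (not part of the commit) of the prompt formatting_func builds for a single record; the EOS token and the example sentences are placeholders, since the real EOS value comes from the base model's tokenizer:

alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = "<|endoftext|>"  # placeholder; train.py uses tokenizer.eos_token

example = {
    "instruction": ["Translate the following Chinese text to English:"],
    "input": ["他打开了门。"],
    "output": ["He opened the door."],
}

texts = [
    alpaca_prompt.format(ins, inp, out) + EOS_TOKEN
    for ins, inp, out in zip(example["instruction"], example["input"], example["output"])
]
print(texts[0])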