diff --git a/.gitignore b/.gitignore
index f7de6f9..3b4dc7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ datasets/
 train.en
 train.zh
 valid*.en
-valid*.zh
\ No newline at end of file
+valid*.zh
+token
\ No newline at end of file
diff --git a/14b_qwen2.yaml b/14b_qwen2.yaml
new file mode 100644
index 0000000..94bed77
--- /dev/null
+++ b/14b_qwen2.yaml
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-14B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 18
+# Minibatch size.
+batch_size: 4
+# (28425 * 3) / 4 ≈ 21319
+iters: 21319
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 5e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied for the last lora_layers
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 21319, 1e-7]
diff --git a/7b_qwen2.yaml b/7b_qwen2.yaml
new file mode 100644
index 0000000..5024029
--- /dev/null
+++ b/7b_qwen2.yaml
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-7B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 8
+# Minibatch size.
+batch_size: 8
+# 28425 / 8 ≈ 3553 (one epoch, adjusted for the larger batch size)
+iters: 3553
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 3e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied for the last lora_layers
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 3553, 1e-7]
diff --git a/mlx_dataset_gen.py b/mlx_dataset_gen.py
index cd7f73a..5473da9 100644
--- a/mlx_dataset_gen.py
+++ b/mlx_dataset_gen.py
@@ -95,13 +95,21 @@ def create_datasets(
 
     # Helper function to write datasets
     def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
+        template = "### Instruction:\nTranslate the following Chinese text to English:\n\n### Input:\n{}\n\n### Response:\n{}"
+
         with open(filepath, "w", encoding="utf-8") as f:
             for text_en, text_zh in chapters:
                 processed_en = process_text(text_en)
                 processed_zh = process_text(text_zh)
+                # entry = {
+                #     "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                # }
+                # entry = {"text": template.format(processed_zh, processed_en)}
                 entry = {
-                    "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                    "instruction": "Translate the following Chinese text to English:",
+                    "input": processed_zh,
+                    "output": processed_en,
                 }
                 f.write(json.dumps(entry, ensure_ascii=False) + "\n")
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..84f1ddf
--- /dev/null
+++ b/train.py
@@ -0,0 +1,175 @@
+import argparse
+
+import torch
+from unsloth import FastLanguageModel
+from datasets import load_dataset
+from trl import SFTTrainer
+from transformers import TrainingArguments
+
+
+def load_data(path):
+    if "drive.google.com" in str(path):
+        try:
+            import gdown
+
+            local_path = "downloaded_dataset.json"
+            gdown.download(url=path, output=local_path, fuzzy=True)
+            dataset_path = local_path
+        except ImportError:
+            raise ImportError("Please install gdown: pip install gdown")
+    else:
+        dataset_path = path
+
+    dataset = load_dataset("json", data_files=dataset_path, split="train")
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base_model",
+        type=str,
+        default="unsloth/Qwen2.5-7B",
+        required=False,
+        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset to train on",
+    )
+    parser.add_argument("--hub_token", type=str, required=False, help="hf token")
+
+    args = parser.parse_args()
+
+    max_seq_length = 16384  # Choose any! We auto support RoPE Scaling internally!
+    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=args.base_model,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )
+
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        lora_alpha=16,
+        lora_dropout=0,  # Supports any, but = 0 is optimized
+        bias="none",  # Supports any, but = "none" is optimized
+        use_gradient_checkpointing="unsloth",
+        random_state=3407,
+        max_seq_length=max_seq_length,
+        use_rslora=False,
+        loftq_config=None,
+    )
+
+    alpaca_prompt = """### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+    DATASET_PATH = args.dataset
+    dataset = load_data(DATASET_PATH)
+
+    EOS_TOKEN = tokenizer.eos_token
+    print(f"EOS Token: {EOS_TOKEN}")
+
+    def formatting_func(example):
+        instructions = example["instruction"]
+        inputs = example["input"]
+        outputs = example["output"]
+        texts = []
+        for instruction, input, output in zip(instructions, inputs, outputs):
+            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
+            texts.append(text)
+
+        return {
+            "text": texts,
+        }
+
+    dataset = dataset.map(formatting_func, batched=True)
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        packing=False,
+        args=TrainingArguments(
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=4,
+            warmup_ratio=0.05,
+            max_grad_norm=1.0,
+            num_train_epochs=1,
+            learning_rate=2e-5,
+            fp16=not torch.cuda.is_bf16_supported(),
+            bf16=torch.cuda.is_bf16_supported(),
+            logging_steps=10,
+            optim="adamw_8bit",
+            weight_decay=0.1,
+            lr_scheduler_type="linear",
+            seed=3407,
+            output_dir="/output/",
+            report_to="none",
+        ),
+    )
+
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")
+
+    trainer_stats = trainer.train()
+
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+
+    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+    print(
+        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
+    )
+    print(f"Peak reserved memory = {used_memory} GB.")
+    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+    try:
+        if args.hub_token:
+            model.push_to_hub_gguf(
+                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
+                tokenizer,
+                quantization_method="q4_k_m",
+                token=args.hub_token,
+            )
+        model.save_pretrained_gguf(
+            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+
+    print("✅ Done.")
+
+
+if __name__ == "__main__":
+    main()
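
For reference, here is a minimal sketch (not part of the diff) of how to sanity-check that the instruction/input/output JSONL written by `write_dataset` lines up with the Alpaca-style prompt that `train.py` builds; the file name `train.jsonl` is a placeholder for a dataset produced by `mlx_dataset_gen.py`:

```python
import json

# Same template that train.py formats each example with.
ALPACA_PROMPT = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

# "train.jsonl" is a placeholder path for a file generated by write_dataset().
with open("train.jsonl", encoding="utf-8") as f:
    entry = json.loads(f.readline())

# Each JSONL line carries "instruction", "input", and "output" keys.
print(ALPACA_PROMPT.format(entry["instruction"], entry["input"], entry["output"]))
```

At training time, `formatting_func` in `train.py` additionally appends the tokenizer's EOS token to each formatted example.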