import argparse

import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel


def load_data(path):
    # Support Google Drive links by downloading the file locally with gdown;
    # otherwise treat the argument as a local JSON path.
    if "drive.google.com" in str(path):
        try:
            import gdown

            local_path = "downloaded_dataset.json"
            gdown.download(url=path, output=local_path, fuzzy=True)
            dataset_path = local_path
        except ImportError:
            raise ImportError("Please install gdown: pip install gdown")
    else:
        dataset_path = path
    dataset = load_dataset("json", data_files=dataset_path, split="train")
    return dataset


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model",
        type=str,
        default="unsloth/Qwen2.5-7B",
        required=False,
        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset to train on (local JSON file or Google Drive link)",
    )
    parser.add_argument(
        "--hf_token",
        type=str,
        required=False,
        help="Hugging Face token used to push the GGUF export to the Hub",
    )
    args = parser.parse_args()

    max_seq_length = 16384  # Choose any! We auto support RoPE Scaling internally!
    dtype = None  # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+
    load_in_4bit = False  # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,  # Supports any, but = 0 is optimized
        bias="none",  # Supports any, but = "none" is optimized
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        max_seq_length=max_seq_length,
        use_rslora=False,
        loftq_config=None,
    )

    alpaca_prompt = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

    # Load the training data (handles both local paths and Google Drive links).
    dataset = load_data(args.dataset)

    EOS_TOKEN = tokenizer.eos_token
    print(f"EOS Token: {EOS_TOKEN}")

    def formatting_func(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, outputs):
            # Append EOS so the model learns where a completion ends.
            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}

    dataset = dataset.map(formatting_func, batched=True)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=16,
            gradient_accumulation_steps=2,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            num_train_epochs=1,
            learning_rate=1e-4,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=50,
            optim="adamw_8bit",
            weight_decay=0.05,
            lr_scheduler_type="cosine",
            seed=3407,
            output_dir="/output/",
            report_to="none",
        ),
    )

    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(
        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
    )
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    try:
        # Push the quantized GGUF to the Hub only when a token is provided,
        # then always keep a local GGUF copy next to the training output.
        if args.hf_token:
            model.push_to_hub_gguf(
                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
                tokenizer,
                quantization_method="q4_k_m",
                token=args.hf_token,
            )
        model.save_pretrained_gguf(
            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
        )
    except Exception as e:
        print(f"Error: {e}")

    print("✅ Done.")


if __name__ == "__main__":
    main()
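

# Example invocation (a minimal sketch; the script name, dataset path, and token
# below are placeholders, not values defined by this repository):
#
#   python train_sft.py \
#       --base_model unsloth/Qwen2.5-7B \
#       --dataset ./my_dataset.json \
#       --hf_token hf_xxx
#
# The JSON dataset is expected to contain "instruction", "input", and "output"
# fields, matching the keys read in formatting_func above.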