chore: _
3  .gitignore  vendored
@@ -7,4 +7,5 @@ datasets/
 train.en
 train.zh
 valid*.en
 valid*.zh
+token
61  14b_qwen2.yaml  Normal file
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-14B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 18
+# Minibatch size.
+batch_size: 4
+# (28425 * 3) / 4 = 21319
+iters: 21319
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 5e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training.
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA-specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied to the last num_layers layers.
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 21319, 1e-7]
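If these YAML files are consumed by mlx-lm's LoRA trainer, which their schema suggests (e.g. `python -m mlx_lm.lora -c 14b_qwen2.yaml`), the lr_schedule block describes a linear warmup into a cosine decay. A rough, self-contained sketch of that shape, for orientation only; the actual schedule comes from mlx.optimizers, and the exact handoff between warmup and decay is an assumption here:

```python
import math

# Approximate shape of the schedule declared above: linear warmup from
# warmup_init to the peak, then cosine decay from the peak down to the end LR.
# This mirrors, but does not reproduce, mlx.optimizers' implementation.
def scheduled_lr(step, peak=5e-5, decay_steps=21319, end=1e-7,
                 warmup=500, warmup_init=1e-7):
    if step < warmup:
        return warmup_init + (peak - warmup_init) * step / warmup
    t = min(step - warmup, decay_steps)
    return end + (peak - end) * 0.5 * (1 + math.cos(math.pi * t / decay_steps))

for s in (0, 250, 500, 10000, 21319):
    print(s, f"{scheduled_lr(s):.2e}")
```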
61  7b_qwen2.yaml  Normal file
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-7B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 8
+# Minibatch size.
+batch_size: 8
+# 28425 / 8 ≈ 3553 (adjusted for the larger batch size; one pass over the data)
+iters: 3553
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 3e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training.
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA-specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied to the last num_layers layers.
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 3553, 1e-7]
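Both configs derive iters from the same bookkeeping: total training examples divided by the minibatch size, scaled by how many passes over the data are intended. A minimal sketch of that arithmetic, taking the 28425-example figure from the comments above as given:

```python
# Reproduces the iters arithmetic from the config comments; 28425 is the
# example count quoted there, "passes" is the intended number of epochs.
def lora_iters(num_examples: int, passes: float, batch_size: int) -> int:
    return round(num_examples * passes / batch_size)

print(lora_iters(28425, 3, 4))  # 21319 -> iters in 14b_qwen2.yaml (three passes)
print(lora_iters(28425, 1, 8))  # 3553  -> iters in 7b_qwen2.yaml (one pass)
```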
@@ -95,13 +95,21 @@ def create_datasets(
 
 # Helper function to write datasets
 def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
+    template = "### Instruction:\nTranslate the following Chinese text to English:\n\n### Input:\n{}\n\n### Response:\n{}"
+
     with open(filepath, "w", encoding="utf-8") as f:
         for text_en, text_zh in chapters:
             processed_en = process_text(text_en)
             processed_zh = process_text(text_zh)
 
+            # entry = {
+            #     "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+            # }
+            # entry = {"text": template.format(processed_zh, processed_en)}
             entry = {
-                "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                "instruction": "Translate the following Chinese text to English:",
+                "input": processed_zh,
+                "output": processed_en,
             }
             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
 
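For reference, this is the JSONL record shape the new write_dataset emits, and how train.py (added below in this commit) renders it into an Alpaca-style training string. The sample sentence pair is made up, and process_text is assumed to have already been applied:

```python
import json

# One record as written by write_dataset (instruction/input/output fields).
record = {
    "instruction": "Translate the following Chinese text to English:",
    "input": "他推开门，走进了夜色里。",
    "output": "He pushed the door open and stepped into the night.",
}
jsonl_line = json.dumps(record, ensure_ascii=False)  # one line of the dataset file

# train.py's formatting_func fills the same Alpaca template and appends the
# tokenizer's EOS token before handing the text to SFTTrainer.
alpaca_prompt = "### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}"
print(alpaca_prompt.format(record["instruction"], record["input"], record["output"]))
```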
175  train.py  Normal file
@@ -0,0 +1,175 @@
+import argparse
+
+import torch
+from unsloth import FastLanguageModel
+from datasets import load_dataset
+from trl import SFTTrainer
+from transformers import TrainingArguments
+
+
+def load_data(path):
+    if "drive.google.com" in str(path):
+        try:
+            import gdown
+
+            local_path = "downloaded_dataset.json"
+            gdown.download(url=path, output=local_path, fuzzy=True)
+            dataset_path = local_path
+        except ImportError:
+            raise ImportError("Please install gdown: pip install gdown")
+    else:
+        dataset_path = path
+
+    dataset = load_dataset("json", data_files=dataset_path, split="train")
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base_model",
+        type=str,
+        default="unsloth/Qwen2.5-7B",
+        required=False,
+        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset to train on (local JSONL path or Google Drive link)",
+    )
+    parser.add_argument("--hf_token", type=str, required=False, help="Hugging Face token")
+
+    args = parser.parse_args()
+
+    max_seq_length = 16384  # Choose any! We auto support RoPE Scaling internally!
+    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=args.base_model,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )
+
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        lora_alpha=16,
+        lora_dropout=0,  # Supports any, but = 0 is optimized
+        bias="none",  # Supports any, but = "none" is optimized
+        use_gradient_checkpointing="unsloth",
+        random_state=3407,
+        max_seq_length=max_seq_length,
+        use_rslora=False,
+        loftq_config=None,
+    )
+
+    alpaca_prompt = """### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+    # Resolve the dataset path (handles Google Drive links) and load it.
+    DATASET_PATH = args.dataset
+    dataset = load_data(DATASET_PATH)
+
+    EOS_TOKEN = tokenizer.eos_token
+    print(f"EOS Token: {EOS_TOKEN}")
+
+    def formatting_func(example):
+        instructions = example["instruction"]
+        inputs = example["input"]
+        outputs = example["output"]
+        texts = []
+        for instruction, input_text, output in zip(instructions, inputs, outputs):
+            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
+            texts.append(text)
+
+        return {
+            "text": texts,
+        }
+
+    dataset = dataset.map(formatting_func, batched=True)
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        packing=False,
+        args=TrainingArguments(
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=4,
+            warmup_ratio=0.05,
+            max_grad_norm=1.0,
+            num_train_epochs=1,
+            learning_rate=2e-5,
+            fp16=not torch.cuda.is_bf16_supported(),
+            bf16=torch.cuda.is_bf16_supported(),
+            logging_steps=10,
+            optim="adamw_8bit",
+            weight_decay=0.1,
+            lr_scheduler_type="linear",
+            seed=3407,
+            output_dir="/output/",
+            report_to=None,
+        ),
+    )
+
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")
+
+    trainer_stats = trainer.train()
+
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+
+    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+    print(
+        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
+    )
+    print(f"Peak reserved memory = {used_memory} GB.")
+    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+    try:
+        if args.hf_token:
+            model.push_to_hub_gguf(
+                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
+                tokenizer,
+                quantization_method="q4_k_m",
+                token=args.hf_token,
+            )
+        model.save_pretrained_gguf(
+            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+
+    print("✅ Done.")
+
+
+if __name__ == "__main__":
+    main()
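A few numbers implied by the TrainingArguments above, for orientation. This assumes the script is invoked as something like `python train.py --dataset train.jsonl --hf_token <token>` and trains on the same roughly 28425-example dataset referenced in the YAML config comments, which is an assumption rather than something the script enforces:

```python
# Back-of-the-envelope step count for the Unsloth run configured above.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
num_train_epochs = 1
num_examples = 28425  # assumed, carried over from the YAML config comments

effective_batch = per_device_train_batch_size * gradient_accumulation_steps
optimizer_steps = num_examples * num_train_epochs // effective_batch
warmup_steps = round(0.05 * optimizer_steps)  # warmup_ratio=0.05

print(effective_batch)   # 8
print(optimizer_steps)   # ~3553, in line with the 7B MLX config
print(warmup_steps)      # ~178
```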