diff --git a/.gitignore b/.gitignore
index f7de6f9..3b4dc7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ datasets/
 train.en
 train.zh
 valid*.en
-valid*.zh
\ No newline at end of file
+valid*.zh
+token
\ No newline at end of file
diff --git a/14b_qwen2.yaml b/14b_qwen2.yaml
new file mode 100644
index 0000000..94bed77
--- /dev/null
+++ b/14b_qwen2.yaml
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-14B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 18
+# Minibatch size.
+batch_size: 4
+# (28425 * 3) / 4 ≈ 21319
+iters: 21319
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 5e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied for the last lora_layers
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 21319, 1e-7]
diff --git a/7b_qwen2.yaml b/7b_qwen2.yaml
new file mode 100644
index 0000000..5024029
--- /dev/null
+++ b/7b_qwen2.yaml
@@ -0,0 +1,61 @@
+model: "mlx-community/Qwen2.5-7B-bf16"
+train: true
+fine_tune_type: "lora"
+data: "/path/to/training/data"
+seed: 42
+
+# Number of layers to fine-tune
+num_layers: 8
+# Minibatch size.
+batch_size: 8
+# 28425 / 8 ≈ 3553 (one epoch, adjusted for the larger batch size)
+iters: 3553
+# Number of validation batches, -1 uses the entire validation set.
+val_batches: 50
+# Adam learning rate.
+learning_rate: 3e-5
+# Number of training steps between loss reporting.
+steps_per_report: 10
+# Number of training steps between validations.
+steps_per_eval: 1000
+# Maximum sequence length.
+max_seq_length: 16384
+
+# Load path to resume training with the given adapter weights.
+resume_adapter_file: null
+# Save/load path for the trained adapter weights.
+adapter_path: "adapters"
+# Save the model every N iterations.
+save_every: 1000
+
+# Evaluate on the test set after training
+test: false
+
+# Number of test set batches, -1 uses the entire test set.
+test_batches: 100
+
+# Use gradient checkpointing to reduce memory use.
+grad_checkpoint: true
+
+# LoRA specific settings
+lora_parameters:
+  # The layer keys to apply LoRA to.
+  # These will be applied for the last lora_layers
+  keys: [
+    "self_attn.q_proj",
+    "self_attn.k_proj",
+    "self_attn.v_proj",
+    "self_attn.o_proj",
+    "mlp.up_proj",
+    "mlp.down_proj",
+    "mlp.gate_proj"
+  ]
+  rank: 16
+  scale: 16.0
+  dropout: 0.05
+
+lr_schedule:
+  name: cosine_decay
+  warmup: 500
+  warmup_init: 1e-7
+  arguments: [5e-5, 3553, 1e-7]
diff --git a/mlx_dataset_gen.py b/mlx_dataset_gen.py
index cd7f73a..5473da9 100644
--- a/mlx_dataset_gen.py
+++ b/mlx_dataset_gen.py
@@ -95,13 +95,21 @@ def create_datasets(
 
     # Helper function to write datasets
     def write_dataset(chapters: List[Tuple[str, str]], filepath: str):
+        template = "### Instruction:\nTranslate the following Chinese text to English:\n\n### Input:\n{}\n\n### Response:\n{}"
+
         with open(filepath, "w", encoding="utf-8") as f:
             for text_en, text_zh in chapters:
                 processed_en = process_text(text_en)
                 processed_zh = process_text(text_zh)
+                # entry = {
+                #     "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                # }
+                # entry = {"text": template.format(processed_zh, processed_en)}
                 entry = {
-                    "text": f"<|im_start|>user\n{processed_zh}<|im_end|>\n<|im_start|>assistant\n{processed_en}<|im_end|>"
+                    "instruction": "Translate the following Chinese text to English:",
+                    "input": processed_zh,
+                    "output": processed_en,
                 }
                 f.write(json.dumps(entry, ensure_ascii=False) + "\n")
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..84f1ddf
--- /dev/null
+++ b/train.py
@@ -0,0 +1,175 @@
+import argparse
+
+import torch
+from unsloth import FastLanguageModel
+from datasets import load_dataset
+from trl import SFTTrainer
+from transformers import TrainingArguments
+
+
+def load_data(path):
+    if "drive.google.com" in str(path):
+        try:
+            import gdown
+
+            local_path = "downloaded_dataset.json"
+            gdown.download(url=path, output=local_path, fuzzy=True)
+            dataset_path = local_path
+        except ImportError:
+            raise ImportError("Please install gdown: pip install gdown")
+    else:
+        dataset_path = path
+
+    dataset = load_dataset("json", data_files=dataset_path, split="train")
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base_model",
+        type=str,
+        default="unsloth/Qwen2.5-7B",
+        required=False,
+        help="Base model to fine-tune. Default: unsloth/Qwen2.5-7B",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        required=True,
+        help="Dataset to train on",
+    )
+    parser.add_argument("--hub_token", type=str, required=False, help="hf token")
+
+    args = parser.parse_args()
+
+    max_seq_length = 16384  # Choose any! We auto support RoPE Scaling internally!
+    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=args.base_model,
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )
+
+    model = FastLanguageModel.get_peft_model(
+        model,
+        r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+        target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        lora_alpha=16,
+        lora_dropout=0,  # Supports any, but = 0 is optimized
+        bias="none",  # Supports any, but = "none" is optimized
+        use_gradient_checkpointing="unsloth",
+        random_state=3407,
+        max_seq_length=max_seq_length,
+        use_rslora=False,
+        loftq_config=None,
+    )
+
+    alpaca_prompt = """### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+    DATASET_PATH = args.dataset
+    dataset = load_data(DATASET_PATH)
+
+    EOS_TOKEN = tokenizer.eos_token
+    print(f"EOS Token: {EOS_TOKEN}")
+
+    def formatting_func(example):
+        instructions = example["instruction"]
+        inputs = example["input"]
+        outputs = example["output"]
+        texts = []
+        for instruction, input, output in zip(instructions, inputs, outputs):
+            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
+            texts.append(text)
+
+        return {
+            "text": texts,
+        }
+
+    dataset = dataset.map(formatting_func, batched=True)
+
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=max_seq_length,
+        packing=False,
+        args=TrainingArguments(
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=4,
+            warmup_ratio=0.05,
+            max_grad_norm=1.0,
+            num_train_epochs=1,
+            learning_rate=2e-5,
+            fp16=not torch.cuda.is_bf16_supported(),
+            bf16=torch.cuda.is_bf16_supported(),
+            logging_steps=10,
+            optim="adamw_8bit",
+            weight_decay=0.1,
+            lr_scheduler_type="linear",
+            seed=3407,
+            output_dir="/output/",
+            report_to="none",
+        ),
+    )
+
+    gpu_stats = torch.cuda.get_device_properties(0)
+    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
+    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
+    print(f"{start_gpu_memory} GB of memory reserved.")
+
+    trainer_stats = trainer.train()
+
+    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
+    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
+    used_percentage = round(used_memory / max_memory * 100, 3)
+    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
+
+    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
+    print(
+        f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training."
+    )
+    print(f"Peak reserved memory = {used_memory} GB.")
+    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
+    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
+    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
+
+    try:
+        if args.hub_token:
+            model.push_to_hub_gguf(
+                "kuwoyuki/qwen2.5-zh-en-wntl-7b-gguf",
+                tokenizer,
+                quantization_method="q4_k_m",
+                token=args.hub_token,
+            )
+        model.save_pretrained_gguf(
+            "/output/gguf-quant/", tokenizer, quantization_method="q4_k_m"
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+
+    print("✅ Done.")
+
+
+if __name__ == "__main__":
+    main()
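
For reference, here is a minimal sketch (not part of the diff) of how to sanity-check that the instruction/input/output JSONL written by `write_dataset` lines up with the Alpaca-style prompt that `train.py` builds; the file name `train.jsonl` is a placeholder for a dataset produced by `mlx_dataset_gen.py`:

```python
import json

# Same template that train.py formats each example with.
ALPACA_PROMPT = """### Instruction:
{}

### Input:
{}

### Response:
{}"""

# "train.jsonl" is a placeholder path for a file generated by write_dataset().
with open("train.jsonl", encoding="utf-8") as f:
    entry = json.loads(f.readline())

# Each JSONL line carries "instruction", "input", and "output" keys.
print(ALPACA_PROMPT.format(entry["instruction"], entry["input"], entry["output"]))
```

At training time, `formatting_func` in `train.py` additionally appends the tokenizer's EOS token to each formatted example.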