diff --git a/train.py b/train.py
index 84f1ddf..93eb045 100644
--- a/train.py
+++ b/train.py
@@ -45,7 +45,7 @@ def main():
     max_seq_length = 16384 # Choose any! We auto support RoPE Scaling internally!
     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+    load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
 
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.base_model,
@@ -114,18 +114,18 @@ def main():
         max_seq_length=max_seq_length,
         packing=False,
         args=TrainingArguments(
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=4,
+            per_device_train_batch_size=16,
+            gradient_accumulation_steps=2,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            num_train_epochs=1,
-            learning_rate=2e-5,
+            learning_rate=1e-4,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
-            logging_steps=10,
+            logging_steps=50,
            optim="adamw_8bit",
-            weight_decay=0.1,
-            lr_scheduler_type="linear",
+            weight_decay=0.05,
+            lr_scheduler_type="cosine",
            seed=3407,
            output_dir="/output/",
            report_to=None,
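
Note: these settings change the effective per-device batch size from 2 × 4 = 8 to 16 × 2 = 32 samples per optimizer step (before multiplying by the number of GPUs). A minimal sketch of that arithmetic, assuming single-GPU training; effective_batch_size is a hypothetical helper, not part of train.py:

    # Hypothetical helper (not in train.py): samples consumed per optimizer step.
    def effective_batch_size(per_device_bs: int, grad_accum: int, num_gpus: int = 1) -> int:
        return per_device_bs * grad_accum * num_gpus

    assert effective_batch_size(2, 4) == 8     # old settings
    assert effective_batch_size(16, 2) == 32   # new settings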