diff --git a/train.py b/train.py
index 84f1ddf..93eb045 100644
--- a/train.py
+++ b/train.py
@@ -45,7 +45,7 @@ def main():
     max_seq_length = 16384 # Choose any! We auto support RoPE Scaling internally!
     dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+    load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
 
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=args.base_model,
@@ -114,18 +114,18 @@ def main():
         max_seq_length=max_seq_length,
         packing=False,
         args=TrainingArguments(
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=4,
+            per_device_train_batch_size=16,
+            gradient_accumulation_steps=2,
            warmup_ratio=0.05,
            max_grad_norm=1.0,
            num_train_epochs=1,
-            learning_rate=2e-5,
+            learning_rate=1e-4,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
-            logging_steps=10,
+            logging_steps=50,
            optim="adamw_8bit",
-            weight_decay=0.1,
-            lr_scheduler_type="linear",
+            weight_decay=0.05,
+            lr_scheduler_type="cosine",
            seed=3407,
            output_dir="/output/",
            report_to=None,
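
Note: these settings change the effective per-device batch size from 2 × 4 = 8 to 16 × 2 = 32 samples per optimizer step (before multiplying by the number of GPUs). A minimal sketch of that arithmetic, assuming single-GPU training; effective_batch_size is a hypothetical helper, not part of train.py:

    # Hypothetical helper (not in train.py): samples consumed per optimizer step.
    def effective_batch_size(per_device_bs: int, grad_accum: int, num_gpus: int = 1) -> int:
        return per_device_bs * grad_accum * num_gpus

    assert effective_batch_size(2, 4) == 8     # old settings
    assert effective_batch_size(16, 2) == 32   # new settings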