diff --git a/merger.py b/merger.py
new file mode 100755
index 0000000..8f1e3d1
--- /dev/null
+++ b/merger.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+import json
+import argparse
+import random
+from collections.abc import Sequence
+from pathlib import Path
+from typing import TypedDict
+
+
+class AlpacaEntry(TypedDict):
+    input: str
+    output: str
+    instruction: str | None
+
+
+class Args(argparse.Namespace):
+    def __init__(self) -> None:
+        super().__init__()
+        self.file1: str = ""
+        self.file2: str = ""
+        self.output: str = ""
+        self.shuffle: bool = False
+        self.seed: int | None = None
+        self.omit_instruction: bool = False
+
+
+def load_file(file_path: str) -> Sequence[AlpacaEntry]:
+    """Load and validate a file in JSON or JSONL format."""
+    path = Path(file_path)
+    is_jsonl = path.suffix.lower() in (".jsonl", ".ljson")
+
+    with open(file_path, "r", encoding="utf-8") as f:
+        if is_jsonl:
+            data: list[AlpacaEntry] = []
+            for line in f:
+                line = line.strip()
+                if line:  # Skip empty lines
+                    entry: AlpacaEntry = json.loads(line)
+                    data.append(entry)
+        else:
+            data = json.load(f)
+
+    # Validate required fields
+    for item in data:
+        if not all(key in item for key in ["input", "output"]):
+            raise ValueError(
+                f"Missing required fields in {file_path}. Each item must have 'input' and 'output' fields."
+            )
+
+    return data
+
+
+def write_output(data: Sequence[AlpacaEntry], output_path: str) -> None:
+    """Write output in JSON or JSONL format based on file extension."""
+    path = Path(output_path)
+    is_jsonl = path.suffix.lower() in (".jsonl", ".ljson")
+    print(f"Writing {len(data)} entries to {path} ({'jsonl' if is_jsonl else 'json'})")
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        if is_jsonl:
+            for item in data:
+                _ = f.write(json.dumps(item, ensure_ascii=False) + "\n")
+        else:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+
+def merge_datasets(
+    file1: str, file2: str, shuffle: bool = False, omit_instruction: bool = False
+) -> Sequence[AlpacaEntry]:
+    """Merge two files in JSON/JSONL format."""
+    # Load both files
+    data1 = load_file(file1)
+    data2 = load_file(file2)
+    merged_data = list(data1) + list(data2)
+
+    if omit_instruction:
+        for item in merged_data:
+            _ = item.pop("instruction", None)
+
+    if shuffle:
+        random.shuffle(merged_data)
+
+    return merged_data
+
+
+def parse_args() -> Args:
+    """Parse and validate command line arguments."""
+    parser = argparse.ArgumentParser(description="Merge two Alpaca-format JSON files")
+    _ = parser.add_argument("file1", type=str, help="Path to first JSON file")
+    _ = parser.add_argument("file2", type=str, help="Path to second JSON file")
+    _ = parser.add_argument("output", type=str, help="Path to output merged JSON file")
+    _ = parser.add_argument(
+        "--shuffle", action="store_true", help="Shuffle the merged dataset"
+    )
+    _ = parser.add_argument("--seed", type=int, help="Random seed for shuffling")
+    _ = parser.add_argument(
+        "--omit-instruction",
+        action="store_true",
+        help="Omit instruction field from output",
+    )
+
+    return parser.parse_args(namespace=Args())
+
+
+def main() -> None:
+    args = parse_args()
+
+    try:
+        if args.seed is not None:
+            random.seed(args.seed)
+
+        merged_data = merge_datasets(
+            args.file1, args.file2, args.shuffle, args.omit_instruction
+        )
+
+        write_output(merged_data, args.output)
+        print(f"Successfully merged files. Total entries: {len(merged_data)}")
+
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/train_torchtune.yaml b/train_torchtune.yaml
new file mode 100644
index 0000000..e7d6103
--- /dev/null
+++ b/train_torchtune.yaml
@@ -0,0 +1,120 @@
+# Config for single device LoRA finetuning in lora_finetune_single_device.py
+# using a Qwen2.5 7B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download Qwen/Qwen2.5-7B --output-dir /home/mira/models/Qwen2.5-7B-Base
+#
+# To launch on a single device, run the following command from root:
+#   tune run lora_finetune_single_device --config train_torchtune.yaml
+#
+# You can add specific overrides through the command line. For example,
+# to override the checkpointer directory while launching training
+# you can run:
+#   tune run lora_finetune_single_device --config train_torchtune.yaml checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works only for training on a single device.
+
+output_dir: /home/mira/models/qwen2_5_7B_tune/lora_single_device  # /tmp may be deleted by your system. Change it to your preference.
+
+# Model Arguments
+model:
+  _component_: torchtune.models.qwen2_5.lora_qwen2_5_7b_base
+  lora_attn_modules: ["q_proj", "k_proj", "v_proj", "output_proj"]
+  apply_lora_to_mlp: True
+  apply_lora_to_output: True
+  lora_rank: 32  # higher increases accuracy and memory
+  lora_alpha: 64  # usually alpha=2*rank
+  lora_dropout: 0.05
+  quantize_base: True
+
+tokenizer:
+  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
+  path: /home/mira/models/Qwen2.5-7B-Base/vocab.json
+  merges_file: /home/mira/models/Qwen2.5-7B-Base/merges.txt
+  max_seq_len: 16384
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /home/mira/models/Qwen2.5-7B-Base
+  checkpoint_files:
+    [
+      model-00001-of-00004.safetensors,
+      model-00002-of-00004.safetensors,
+      model-00003-of-00004.safetensors,
+      model-00004-of-00004.safetensors,
+    ]
+  recipe_checkpoint: null
+  output_dir: ${output_dir}
+  model_type: QWEN2
+save_every_n_steps: 100
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: True  # True increases speed
+seed: 42
+shuffle: False
+batch_size: 1
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 1e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 32  # Use to increase effective batch size
+clip_grad_norm: null
+compile: True  # torch.compile the model + loss, True increases speed + decreases memory
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}/logs
+log_every_n_steps: 5
+log_peak_memory_stats: True
+
+# Environment
+device: cuda
+dtype: bf16  # bf16 rather than fp16; torchtune's precision utilities reject fp16
+
+# Activations Offloading
+enable_activation_checkpointing: True  # True reduces memory
+enable_activation_offloading: True  # True reduces memory
+
+# Showcase the usage of pytorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
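As a quick sanity check of the merge script, the sketch below drives `merge_datasets` and `write_output` from merger.py programmatically instead of through the CLI. It is a minimal sketch, assuming merger.py is importable from the working directory; the input file names (`part1.json`, `part2.jsonl`) and the output path are hypothetical placeholders, not files referenced anywhere in this change.

```python
# Minimal sketch: merge two hypothetical Alpaca-format files without the CLI.
# Assumes merger.py sits next to this snippet; part1.json / part2.jsonl are
# placeholder inputs you would supply yourself.
import random

from merger import merge_datasets, write_output

random.seed(42)  # mirrors --seed so shuffling is reproducible
merged = merge_datasets(
    "part1.json",   # plain JSON: a list of {"instruction", "input", "output"} objects
    "part2.jsonl",  # JSON Lines also works; the format is inferred from the file suffix
    shuffle=True,            # equivalent to passing --shuffle
    omit_instruction=False,  # keep the instruction field for Alpaca-style training
)
write_output(merged, "merged_train.json")  # a .jsonl suffix would emit JSON Lines instead
print(f"{len(merged)} entries written")
```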