| | |
| | |
| |
|
| | import json |
| | from pathlib import Path |
| |
|
| | import torch |
| | from datasets import Dataset |
| | from peft import LoraConfig, get_peft_model |
| | from transformers import ( |
| | AutoModelForCausalLM, |
| | AutoTokenizer, |
| | DataCollatorForSeq2Seq, |
| | Trainer, |
| | TrainingArguments, |
| | ) |
| |
|
| |
|
def resolve_dataset_paths(paths: list[str]) -> list[Path]:
    """
    Expand a mix of file and directory paths into concrete dataset files.

    Args:
        paths: File or directory paths. A file is kept as-is; a directory
            contributes its direct ``*.jsonl`` children in sorted order.

    Returns:
        The resolved paths, in the order the inputs were given.

    Raises:
        FileNotFoundError: If an entry is neither an existing file nor an
            existing directory.
    """
    collected: list[Path] = []
    for raw in paths:
        candidate = Path(raw)

        if candidate.is_file():
            collected.append(candidate)
            continue

        if not candidate.is_dir():
            raise FileNotFoundError(f"Dataset path not found: {candidate}")

        matches = sorted(candidate.glob("*.jsonl"))
        if not matches:
            # An empty directory is only warned about, not treated as an error.
            print(f" Warning: No .jsonl files found in {candidate}")
        collected += matches
    return collected
| |
|
| |
|
def load_dataset_from_jsonl(paths: list[str]) -> Dataset:
    """
    Read chat examples from JSONL files and combine them into one Dataset.

    Each non-blank line is parsed as one JSON object of the form:
    {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

    Args:
        paths: File and/or directory paths; directories are expanded to the
            *.jsonl files they contain (see resolve_dataset_paths).

    Returns:
        A single Dataset holding the records of every file, in file order.

    Raises:
        ValueError: If the paths resolve to no files at all.
    """
    jsonl_files = resolve_dataset_paths(paths)
    if len(jsonl_files) == 0:
        raise ValueError("No dataset files found")

    records = []
    for jsonl_file in jsonl_files:
        print(f" Loading: {jsonl_file}")
        already_loaded = len(records)
        with open(jsonl_file, "r", encoding="utf-8") as handle:
            # Blank / whitespace-only lines are skipped rather than parsed.
            records.extend(json.loads(raw) for raw in handle if raw.strip())
        print(f" -> {len(records) - already_loaded} examples")

    return Dataset.from_list(records)
| |
|
| |
|
def format_chat_example(example: dict, tokenizer) -> dict:
    """
    Render one example's conversation through the tokenizer's chat template.

    Args:
        example: Record with a "messages" entry (the conversation turns).
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        ``{"text": <rendered string>}``. The template is asked for a plain
        string (no tokenization) and no trailing generation prompt.
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}
| |
|
| |
|
def tokenize_example(example: dict, tokenizer, max_length: int = 512) -> dict:
    """
    Tokenize chat-templated text and attach causal-LM labels.

    Args:
        example: Record with a "text" entry holding the already
            chat-templated string.
        tokenizer: Callable tokenizer; its result must be a mapping with an
            "input_ids" list.
        max_length: Sequences longer than this are truncated.

    Returns:
        The tokenizer output with an extra "labels" entry that is an
        independent copy of "input_ids" (loss is computed on every token).
        No padding is applied here; that is left to the data collator.
    """
    result = tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        padding=False,
        # The chat template has already written the special tokens it needs.
        # Letting the tokenizer add its own again can duplicate BOS/EOS on
        # tokenizers that insert them automatically.
        add_special_tokens=False,
    )

    # Copy so that later edits to labels cannot alias input_ids.
    result["labels"] = result["input_ids"].copy()

    return result
| |
|
| |
|
def create_model_and_tokenizer(model_name: str = "Qwen/Qwen2.5-3B-Instruct"):
    """
    Load a causal LM with its tokenizer and wrap the model with LoRA adapters.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is the PEFT-wrapped model.
    """
    print(f"Loading model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Backend preference: MPS first, then CUDA, then CPU. Each backend gets
    # its own device map and load dtype.
    if torch.backends.mps.is_available():
        backend_note = "Using Apple MPS (Metal) backend"
        device_map, model_dtype = {"": "mps"}, torch.float16
    elif torch.cuda.is_available():
        backend_note = "Using CUDA backend"
        device_map, model_dtype = "auto", torch.bfloat16
    else:
        backend_note = "Using CPU backend (this will be slow)"
        device_map, model_dtype = {"": "cpu"}, torch.float32
    print(backend_note)

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=model_dtype,
        device_map=device_map,
        trust_remote_code=True,
    )

    print("Applying LoRA configuration...")
    # Module names that receive LoRA adapters.
    adapted_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=adapted_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    peft_model = get_peft_model(base_model, lora_config)
    peft_model.print_trainable_parameters()

    return peft_model, tokenizer
| |
|
| |
|
def train(
    dataset_paths: list[str],
    output_dir: str = "outputs/lora-adapter",
    model_name: str = "Qwen/Qwen2.5-3B-Instruct",
    num_epochs: int = 3,
    batch_size: int = 2,
    gradient_accumulation_steps: int = 4,
    learning_rate: float = 2e-4,
    max_length: int = 512,
    val_split: float = 0.1,
):
    """
    Run the full LoRA fine-tuning pipeline and save the resulting adapter.

    Steps: load model and tokenizer, load the JSONL data, apply the chat
    template, tokenize, optionally split off a validation set, train with
    the HuggingFace Trainer, then save adapter and tokenizer to output_dir.

    Args:
        dataset_paths: List of paths to JSONL training data files
            (directories are accepted too and expanded by the loader)
        output_dir: Where to save the LoRA adapter
        model_name: HuggingFace model ID
        num_epochs: Number of training epochs
        batch_size: Per-device batch size (used for both train and eval)
        gradient_accumulation_steps: Accumulate gradients over N steps
        learning_rate: Learning rate passed to TrainingArguments
        max_length: Maximum sequence length (longer examples are truncated)
        val_split: Fraction of data to use for validation; a value <= 0
            disables validation entirely

    Returns:
        A ``(model, tokenizer)`` tuple after training.
    """
    banner = "=" * 60
    print(banner)
    print("LoRA Fine-Tuning")
    print(banner)

    model, tokenizer = create_model_and_tokenizer(model_name)

    print("\nLoading dataset(s):")
    dataset = load_dataset_from_jsonl(dataset_paths)
    print(f" Total examples: {len(dataset)}")

    print("Applying chat template...")
    dataset = dataset.map(
        lambda x: format_chat_example(x, tokenizer),
        desc="Formatting",
    )

    print("Tokenizing...")
    dataset = dataset.map(
        lambda x: tokenize_example(x, tokenizer, max_length),
        # Drop the raw columns so only the tokenizer outputs remain.
        remove_columns=dataset.column_names,
        desc="Tokenizing",
    )

    if val_split > 0:
        # Fixed seed keeps the train/validation partition reproducible.
        split = dataset.train_test_split(test_size=val_split, seed=42)
        train_dataset = split["train"]
        eval_dataset = split["test"]
        print(f" Train examples: {len(train_dataset)}")
        print(f" Validation examples: {len(eval_dataset)}")
    else:
        train_dataset = dataset
        eval_dataset = None
        print(f" Train examples: {len(train_dataset)}")

    # Test for None explicitly: the truthiness of a Dataset is derived from
    # its length, which is not the question being asked here.
    has_eval = eval_dataset is not None

    # Examples were tokenized without padding, so batches are padded here.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        return_tensors="pt",
    )

    use_mps = torch.backends.mps.is_available()

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        weight_decay=0.01,
        warmup_ratio=0.1,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="epoch" if has_eval else "no",
        load_best_model_at_end=has_eval,
        metric_for_best_model="eval_loss" if has_eval else None,
        greater_is_better=False,
        # Mixed-precision flags mirror the dtype chosen at model load time:
        # fp16 on MPS, bf16 on CUDA, neither on CPU.
        fp16=use_mps,
        bf16=not use_mps and torch.cuda.is_available(),
        dataloader_pin_memory=not use_mps,
        report_to="none",
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        processing_class=tokenizer,
    )

    print("\n" + banner)
    print("Starting training...")
    print(banner)
    trainer.train()

    print(f"\nSaving adapter to: {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("\n" + banner)
    print("Training complete!")
    print(banner)
    print(f"\nAdapter saved to: {output_dir}")

    # Only count the adapter's own files. The tokenizer was just saved into
    # the same directory, and summing every file would overstate the size.
    adapter_bytes = sum(
        f.stat().st_size
        for f in Path(output_dir).glob("adapter_*")
        if f.is_file()
    )
    print(f"Adapter size: {adapter_bytes / 1024 / 1024:.1f} MB")

    return model, tokenizer
| |
|
| |
|
def test_model(model, tokenizer, test_diary: str):
    """
    Ask the fine-tuned model for a score on a single diary entry.

    Args:
        model: Model exposing ``generate``, ``device``, ``training``,
            ``eval()`` and ``train()``.
        tokenizer: Tokenizer matching the model (chat template, encode, decode).
        test_diary: Free-text diary entry to score.

    Returns:
        A ``(score, response)`` tuple. ``score`` is the last digit character
        found in the newly generated text, or None if the model produced no
        digit. ``response`` is the full decoded sequence (prompt followed by
        the completion).
    """
    messages = [
        {
            "role": "user",
            "content": f"Diary: {test_diary}\n\nWhat is the disease activity score for today?",
        }
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The template already contains the special tokens it needs, so do not
    # let the tokenizer prepend/append another set.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate in eval mode so dropout cannot perturb the greedy decode,
    # then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        if was_training:
            model.train()

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Search only the completion. The full sequence starts with the prompt,
    # and a digit written in the diary must not be reported as the score.
    completion = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )
    # NOTE(review): only a single digit is returned; confirm that scores
    # never have more than one digit.
    score = next((ch for ch in reversed(completion) if ch.isdigit()), None)

    return score, response
| |
|
| |
|
if __name__ == "__main__":
    # Command-line entry point: train an adapter, then run a quick smoke
    # test of the result on a few hand-written diary entries.
    import argparse

    cli = argparse.ArgumentParser(description="Fine-tune Qwen2.5 with LoRA")
    cli.add_argument(
        "--dataset",
        type=str,
        nargs="+",
        required=True,
        help="Path(s) to training dataset(s) (JSONL). Multiple files are concatenated.",
    )
    # Remaining options are plain "flag, type, default, help" scalars.
    for flag, value_type, fallback, description in (
        ("--output", str, "outputs/lora-adapter", "Output directory for the adapter"),
        ("--epochs", int, 3, "Number of training epochs"),
        ("--batch-size", int, 2, "Per-device batch size"),
        ("--lr", float, 2e-4, "Learning rate"),
    ):
        cli.add_argument(flag, type=value_type, default=fallback, help=description)

    options = cli.parse_args()

    model, tokenizer = train(
        dataset_paths=options.dataset,
        output_dir=options.output,
        num_epochs=options.epochs,
        batch_size=options.batch_size,
        learning_rate=options.lr,
    )

    separator = "=" * 60
    print("\n" + separator)
    print("Testing the fine-tuned model...")
    print(separator)

    # Three sample entries: no symptoms, severe symptoms, mild symptoms.
    test_diaries = [
        "I felt fine today, no pain at all. Went for a walk and felt great.",
        "Severe pain in my joints all day. Had to stay in bed. Medication didn't help much.",
        "Some stiffness this morning but it went away. Managed to work from home.",
    ]

    for diary in test_diaries:
        score, _ = test_model(model, tokenizer, diary)
        print(f"\nDiary: {diary[:60]}...")
        print(f"Predicted score: {score}")
| |
|