"""
tool_trainer_intensive.py - Intensive Training for 80% Target

This trainer implements:

1. 12 epochs (vs 3 before)
2. A better learning-rate schedule (cosine with warmup)
3. Optimized training parameters
4. Progress monitoring toward the 80% target
"""

import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json
import time


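# NOTE: assumes the JSONL file holds one {"prompt": ..., "chosen": ...} object per
# line, which is the shape format_training_data() relies on below.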
def load_training_data(file_path="tool_pairs_massive.jsonl"):
    """Load the massive training dataset (JSONL, one example per line)."""
    pairs = []
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines, e.g. a trailing newline at EOF
                pairs.append(json.loads(line))
    return pairs


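# Training text is the prompt concatenated with the chosen completion plus EOS,
# so the causal-LM loss is computed over the prompt tokens as well as the
# completion; masking the prompt would require building custom labels.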
def format_training_data(pairs, tokenizer):
    """Format training data for the model."""
    formatted = []
    for pair in pairs:
        full_text = pair["prompt"] + pair["chosen"] + tokenizer.eos_token
        formatted.append({"text": full_text})
    return formatted


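# Every example is padded/truncated to a fixed 400 tokens so batches have a
# uniform shape; anything longer than max_length is cut off.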
def tokenize_function(examples, tokenizer, max_length=400):
    """Tokenize with optimized settings for intensive training."""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None
    )
    # Causal LM: labels mirror input_ids (copy to avoid aliasing the same list).
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


def main():
    print("INTENSIVE Training: SmolLM3-3B for 80% Target")
    print("=" * 60)

    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    start_time = time.time()

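    # float32 keeps the MPS path simple (half precision on MPS can be flaky),
    # at the cost of roughly 12 GB of weights for a 3B-parameter model.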
    print("Loading SmolLM3-3B...")
    model_name = "HuggingFaceTB/SmolLM3-3B"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map={"": device} if device == "mps" else "auto"
    )

    print(f"Model loaded: {model.num_parameters() / 1e9:.1f}B params")

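    # LoRA rank 32 with alpha 64 gives a scaling factor of alpha/r = 2.0, and the
    # adapters cover every attention projection plus the MLP projections, so this
    # adapter has more capacity than a typical rank-8/16, attention-only setup.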
    print("Setting up enhanced LoRA (rank 32)...")
    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, lora_config)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable_params:,} parameters ({100 * trainable_params / total_params:.2f}%)")

    print("Loading massive training data...")
    pairs = load_training_data()
    print(f"{len(pairs)} training examples ready")

    print("Tokenizing massive dataset...")
    formatted_data = format_training_data(pairs, tokenizer)
    dataset = Dataset.from_list(formatted_data)

    tokenized_dataset = dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=dataset.column_names
    )
    print(f"Tokenized {len(tokenized_dataset)} examples")

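    # Effective batch size is per_device_train_batch_size * gradient_accumulation_steps
    # = 2 * 4 = 8 sequences per optimizer step. fp16 stays off because fp16 mixed
    # precision is generally not supported on MPS, and gradient checkpointing trades
    # extra compute for lower activation memory so the 3B model fits more comfortably.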
    print("Configuring intensive training...")
    training_args = TrainingArguments(
        output_dir="./smollm3_intensive",
        num_train_epochs=12,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=3e-5,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        logging_steps=10,
        save_steps=100,
        save_total_limit=3,
        push_to_hub=False,
        report_to="none",  # disable external logging integrations
        dataloader_pin_memory=False,
        fp16=False,
        gradient_checkpointing=True,
        max_grad_norm=1.0,
        adam_epsilon=1e-8,
        adam_beta1=0.9,
        adam_beta2=0.999,
    )

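    # With mlm=False the collator does causal-LM batching: it rebuilds labels from
    # input_ids and sets pad positions to -100 so padding is ignored by the loss.
    # Caveat: because pad_token == eos_token here, the real EOS at the end of each
    # example gets masked as well, which can make the model reluctant to stop.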
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8,
    )

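    # No eval_dataset is passed, so training loss is the only progress signal;
    # the 80% target itself has to be evaluated separately after training.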
    print("Initializing intensive trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    print("Starting INTENSIVE training...")
    print(f"Dataset: {len(pairs)} examples")
    print("Epochs: 12 (vs 3 before)")
    print("Learning rate: 3e-5 with cosine schedule")
    print("Expected time: ~10-15 minutes")
    print("Monitoring for dramatic improvement...")

    train_result = trainer.train()

    training_time = time.time() - start_time
    print("\nINTENSIVE Training completed!")
    print(f"Final loss: {train_result.training_loss:.4f}")
    print(f"Training time: {training_time:.1f}s")

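    # save_pretrained on a PEFT model writes only the LoRA adapter weights and
    # adapter_config.json, not a merged copy of the 3B base model.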
    print("Saving intensively trained model...")
    model.save_pretrained("./smollm3_intensive")
    tokenizer.save_pretrained("./smollm3_intensive")

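    # Smoke test: temperature 0.1 with do_sample=True is close to greedy decoding,
    # and slicing off the prompt tokens below means only the newly generated text
    # is decoded and checked.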
    print("Quick validation test...")
    model.eval()
    test_input = "Get weather for New York"
    inputs = tokenizer(test_input, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"Model response: {response}")

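    # This only verifies that the output parses as JSON; it does not check that
    # the tool name or arguments are actually correct.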
    try:
        parsed = json.loads(response.strip())
        print(f"Valid JSON! {parsed}")
    except json.JSONDecodeError as e:
        print(f"JSON error: {e}")

    print("\nIntensive training complete!")
    print("Ready for 80% target evaluation")

    return model, tokenizer


if __name__ == "__main__":
    model, tokenizer = main()