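"""Fine-tune a TinyLlama base model on the processed Bengali code dataset.

Loads the custom tokenizer from outputs/tokenizer, splits data/raw/processed_data.json
into train/validation sets, and trains with the Hugging Face Trainer while logging to
Weights & Biases.
"""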
import json
from pathlib import Path
import logging
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import wandb
import numpy as np
from datasets import load_dataset
from typing import Dict, List, Any
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliCodeDataset(Dataset):
    def __init__(self, data_path: Path, tokenizer, max_length: int = 2048):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Load the processed data
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        logger.info(f"Loaded {len(self.data)} examples from {data_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        item = self.data[idx]
        text = item['text']
        # Tokenize the text
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Prepare the labels (same as input_ids for causal language modeling)
        labels = encodings.input_ids.clone()
        # Create attention mask
        attention_mask = encodings.attention_mask
        return {
            'input_ids': encodings.input_ids[0],
            'attention_mask': attention_mask[0],
            'labels': labels[0]
        }
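
# A sketch of what one dataset item looks like (assuming the tokenizer saved in
# outputs/tokenizer is a standard Hugging Face tokenizer with a pad token): __getitem__
# returns three 1-D tensors, each of length max_length, e.g.
#   {'input_ids': LongTensor(2048), 'attention_mask': LongTensor(2048),
#    'labels': LongTensor(2048)}
# Note that DataCollatorForLanguageModeling(mlm=False) in ModelTrainer.train() rebuilds
# the labels from input_ids at collation time and sets padded positions to -100, so the
# pad tokens included in the labels above should not contribute to the loss.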


class ModelTrainer:
    def __init__(self):
        self.data_dir = Path('data/raw')
        self.tokenizer_dir = Path('outputs/tokenizer')
        self.output_dir = Path('outputs/model')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Training configuration
        self.model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
        self.max_length = 2048
        self.batch_size = 4
        self.gradient_accumulation_steps = 4
        self.learning_rate = 2e-5
        self.num_train_epochs = 3
        self.warmup_steps = 100
        self.save_steps = 1000
        self.eval_steps = 500

    def setup_wandb(self):
        """Initialize Weights & Biases tracking."""
        wandb.init(
            project="bengali-code-llm",
            name="tinyllama-bengali-code",
            config={
                "model_name": self.model_name,
                "max_length": self.max_length,
                "batch_size": self.batch_size,
                "learning_rate": self.learning_rate,
                "num_epochs": self.num_train_epochs
            }
        )
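
    # Note: wandb.init() expects an authenticated session (e.g. via `wandb login` or the
    # WANDB_API_KEY environment variable); setting WANDB_MODE=offline is a common way to
    # run this script without a wandb account.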

    def prepare_model_and_tokenizer(self):
        """Load and prepare the model and tokenizer."""
        logger.info("Loading tokenizer and model")

        # Load the custom tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_dir,
            model_max_length=self.max_length
        )

        # Load the base model
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )

        # Resize token embeddings to match our tokenizer
        model.resize_token_embeddings(len(tokenizer))
        return model, tokenizer
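
    # The embedding resize above is needed because the custom tokenizer in
    # outputs/tokenizer generally has a different vocabulary size than the TinyLlama
    # base model. One assumption about that saved tokenizer: padding='max_length' in
    # BengaliCodeDataset only works if it defines a pad token; if it does not, set one,
    # e.g. tokenizer.pad_token = tokenizer.eos_token.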

    def create_datasets(self, tokenizer):
        """Create training and validation datasets."""
        logger.info("Creating datasets")

        # Load the processed data
        data_path = self.data_dir / 'processed_data.json'
        with open(data_path, 'r', encoding='utf-8') as f:
            all_data = json.load(f)

        # Split data into train and validation (90% train, 10% validation)
        np.random.seed(42)
        np.random.shuffle(all_data)
        split_idx = int(len(all_data) * 0.9)
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        # Save split data
        train_path = self.data_dir / 'train.json'
        val_path = self.data_dir / 'validation.json'
        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)
        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)

        # Create datasets
        train_dataset = BengaliCodeDataset(train_path, tokenizer, self.max_length)
        val_dataset = BengaliCodeDataset(val_path, tokenizer, self.max_length)
        return train_dataset, val_dataset
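
    # The fixed seed (42) makes the shuffle, and therefore the train/validation split,
    # reproducible across runs; writing train.json and validation.json to data/raw also
    # lets the exact split be inspected or reused later.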

    def create_training_arguments(self):
        """Create training arguments for the Trainer."""
        return TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            evaluation_strategy="steps",
            eval_steps=self.eval_steps,
            save_strategy="steps",
            save_steps=self.save_steps,
            learning_rate=self.learning_rate,
            warmup_steps=self.warmup_steps,
            weight_decay=0.01,
            logging_dir=str(self.output_dir / 'logs'),
            logging_steps=100,
            report_to="wandb",
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            # Use bf16 mixed precision to match the bfloat16 weights loaded in
            # prepare_model_and_tokenizer; mixing fp16 autocast with bf16 weights can
            # fail or silently lose precision.
            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
            remove_unused_columns=False
        )
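
    # With the defaults above, the effective batch size is
    # per_device_train_batch_size * gradient_accumulation_steps = 4 * 4 = 16 per device.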

    def train(self):
        """Main method to train the model."""
        try:
            # Initialize wandb
            self.setup_wandb()

            # Prepare model and tokenizer
            model, tokenizer = self.prepare_model_and_tokenizer()

            # Create datasets
            train_dataset, val_dataset = self.create_datasets(tokenizer)

            # Create training arguments
            training_args = self.create_training_arguments()

            # Create data collator
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False  # We're doing causal language modeling
            )

            # Initialize trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer
            )

            # Train the model
            logger.info("Starting model training")
            trainer.train()

            # Save the final model
            trainer.save_model(str(self.output_dir / 'final'))
            tokenizer.save_pretrained(str(self.output_dir / 'final'))

            # Close wandb
            wandb.finish()
            logger.info("Model training completed successfully")
        except Exception as e:
            logger.error(f"Model training failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed
            if wandb.run is not None:
                wandb.finish()


if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()
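
# Assuming the repository layout used above (data/raw/processed_data.json and
# outputs/tokenizer/ already exist), the script would be run from the repo root, e.g.:
#   python scripts/model_trainer.py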