#!/usr/bin/env python3
"""
Sheikh-2.5-Coder Training Script
================================
This script handles the training pipeline for Sheikh-2.5-Coder model.
"""
import os
import torch
import argparse
from typing import Optional
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup,
)
from datasets import load_dataset, Dataset
import yaml
from model import SheikhModel, SheikhConfig, setup_training_args


def load_config(config_path: str) -> dict:
    """Load training configuration from YAML file."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)
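
# The YAML file is expected to provide at least the keys read later in this
# script (config['model'][...], config['training']['learning_rate'], and
# config['data']). A minimal illustrative configuration, with hypothetical
# values, would deserialize to something like:
#
# EXAMPLE_CONFIG = {
#     "model": {
#         "hidden_size": 2048,
#         "num_attention_heads": 16,
#         "num_key_value_heads": 2,
#         "num_hidden_layers": 36,
#         "intermediate_size": 11008,
#         "context_length": 32768,
#     },
#     "training": {"learning_rate": 2e-5},
#     "data": {"dataset_name": "<your-dataset>"},
# }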


def prepare_training_data(data_config: dict) -> Dataset:
    """Prepare training dataset."""
    # This would be implemented for your specific data sources, e.g. loading
    # from Hugging Face datasets or custom files, then tokenizing into
    # input_ids / attention_mask / labels.
    print("Loading training data...")
    # Placeholder: return a tiny dummy dataset so the pipeline runs end to end.
    train_dataset = Dataset.from_dict({
        'input_ids': [[1, 2, 3, 4, 5]],
        'attention_mask': [[1, 1, 1, 1, 1]],
        'labels': [[2, 3, 4, 5, 6]]
    })
    return train_dataset
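

# A minimal sketch of how prepare_training_data could be fleshed out for a real
# corpus, assuming a Hugging Face dataset with a plain "text" column and the
# standard causal-LM convention of labels being a copy of input_ids. The function
# name, the config keys, and the "text" column are assumptions for illustration,
# not part of the current pipeline.
def prepare_hf_training_data(data_config: dict, tokenizer, max_length: int = 2048) -> Dataset:
    """Illustrative only: load and tokenize a Hugging Face dataset for causal-LM training."""
    raw = load_dataset(data_config["dataset_name"], split=data_config.get("split", "train"))

    def tokenize(batch):
        enc = tokenizer(
            batch["text"],
            truncation=True,
            max_length=max_length,
        )
        # For causal language modeling the labels mirror the inputs;
        # DataCollatorForSeq2Seq later pads labels with -100 so padding is ignored in the loss.
        enc["labels"] = [ids.copy() for ids in enc["input_ids"]]
        return enc

    return raw.map(tokenize, batched=True, remove_columns=raw.column_names)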


def setup_model_and_tokenizer(config: dict) -> tuple:
    """Set up the model and tokenizer."""
    print("Initializing model and tokenizer...")
    # Load the tokenizer from the base model used for continued training.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
    # Ensure the tokenizer has a padding token before it is used for collation.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Create the model configuration from the YAML settings.
    model_config = SheikhConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=config['model']['hidden_size'],
        num_attention_heads=config['model']['num_attention_heads'],
        num_key_value_heads=config['model']['num_key_value_heads'],
        num_hidden_layers=config['model']['num_hidden_layers'],
        intermediate_size=config['model']['intermediate_size'],
        max_position_embeddings=config['model']['context_length'],
    )
    # Initialize the model and resize the embeddings to match the tokenizer.
    model = SheikhModel(model_config)
    model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer
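

# AutoModelForCausalLM is imported above but not used by setup_model_and_tokenizer.
# If the intent is continued training from the Qwen/Qwen2.5-3B weights rather than
# initializing SheikhModel from scratch, a variant could look like the sketch below
# (an assumption about intent, not the current behavior of this script):
def setup_pretrained_model_and_tokenizer(base_model: str = "Qwen/Qwen2.5-3B") -> tuple:
    """Illustrative only: load a pretrained base model for continued training."""
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.bfloat16,  # reduces memory; assumes bf16-capable hardware
    )
    model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer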


def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    config: dict,
    output_dir: str,
    resume_from_checkpoint: Optional[str] = None,
):
    """Train the Sheikh-2.5-Coder model."""
    # Build the training arguments from the YAML configuration.
    training_config = config['training']
    args = setup_training_args(
        output_dir=output_dir,
        learning_rate=training_config['learning_rate'],
    )
    # Data collator: pads input_ids/attention_mask and pads labels with -100.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt",
    )
    # Initialize the trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    # Start training, optionally resuming from a checkpoint.
    print("Starting training...")
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    # Save the final model and tokenizer.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Training completed! Model saved to {output_dir}")


def main():
    parser = argparse.ArgumentParser(description="Train the Sheikh-2.5-Coder model")
    parser.add_argument(
        "--config",
        type=str,
        default="training_config.yaml",
        help="Path to the training configuration file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./sheikh-2.5-coder-output",
        help="Directory to save the trained model",
    )
    parser.add_argument(
        "--resume-from-checkpoint",
        type=str,
        default=None,
        help="Path to a checkpoint to resume from",
    )
    args = parser.parse_args()
    # Load configuration.
    config = load_config(args.config)
    # Set up model and tokenizer.
    model, tokenizer = setup_model_and_tokenizer(config)
    # Prepare training data (the eval split is a placeholder for now).
    train_dataset = prepare_training_data(config['data'])
    eval_dataset = prepare_training_data(config['data'])
    # Create output directory.
    os.makedirs(args.output_dir, exist_ok=True)
    # Train the model.
    train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        config=config,
        output_dir=args.output_dir,
        resume_from_checkpoint=args.resume_from_checkpoint,
    )


if __name__ == "__main__":
    main()
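

# Example invocations (the checkpoint path is illustrative; Trainer writes
# checkpoints named checkpoint-<step> under the output directory):
#
#   python train.py --config training_config.yaml --output-dir ./sheikh-2.5-coder-output
#   python train.py --config training_config.yaml --resume-from-checkpoint ./sheikh-2.5-coder-output/checkpoint-500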