|
|
|
|
|
""" |
|
|
Sheikh-2.5-Coder Training Script |
|
|
================================ |
|
|
|
|
|
This script handles the training pipeline for Sheikh-2.5-Coder model. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import torch |
|
|
import argparse |
|
|
from typing import Optional |
|
|
from transformers import ( |
|
|
AutoTokenizer, |
|
|
AutoModelForCausalLM, |
|
|
TrainingArguments, |
|
|
Trainer, |
|
|
DataCollatorForSeq2Seq, |
|
|
get_linear_schedule_with_warmup, |
|
|
) |
|
|
from datasets import load_dataset, Dataset |
|
|
import yaml |
|
|
from model import SheikhModel, SheikhConfig, setup_training_args |
|
|
|
|
|
def load_config(config_path: str) -> dict:
    """Load the training configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration as a dict.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit encoding avoids platform-dependent defaults on Windows;
    # safe_load refuses to construct arbitrary Python objects from YAML.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
|
|
|
|
def prepare_training_data(data_config: dict) -> Dataset:
    """Prepare the training dataset.

    NOTE(review): this is a placeholder — ``data_config`` is currently
    ignored and a tiny hard-coded dataset is returned; confirm the real
    loading pipeline before production training.
    """
    print("Loading training data...")

    # One dummy tokenized sequence: ids, full attention mask, and
    # next-token labels shifted by one.
    dummy_example = {
        'input_ids': [[1, 2, 3, 4, 5]],
        'attention_mask': [[1, 1, 1, 1, 1]],
        'labels': [[2, 3, 4, 5, 6]],
    }
    return Dataset.from_dict(dummy_example)
|
|
|
|
|
def setup_model_and_tokenizer(config: dict) -> tuple:
    """Build the SheikhModel and its tokenizer from the training config.

    Args:
        config: Parsed training configuration; the ``model`` section must
            provide ``hidden_size``, ``num_attention_heads``,
            ``num_key_value_heads``, ``num_hidden_layers``,
            ``intermediate_size`` and ``context_length``.

    Returns:
        A ``(model, tokenizer)`` tuple ready for training.
    """
    print("Initializing model and tokenizer...")

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")

    # Ensure a pad token exists BEFORE sizing anything, so that
    # len(tokenizer) already reflects the final vocabulary.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model_cfg = config['model']
    model_config = SheikhConfig(
        # Use len(tokenizer), not tokenizer.vocab_size: the latter excludes
        # added special tokens, which previously made the config vocab
        # disagree with the post-resize embedding size.
        vocab_size=len(tokenizer),
        hidden_size=model_cfg['hidden_size'],
        num_attention_heads=model_cfg['num_attention_heads'],
        num_key_value_heads=model_cfg['num_key_value_heads'],
        num_hidden_layers=model_cfg['num_hidden_layers'],
        intermediate_size=model_cfg['intermediate_size'],
        max_position_embeddings=model_cfg['context_length'],
    )

    model = SheikhModel(model_config)

    # Keep the embedding matrix in sync with the tokenizer. This is now a
    # no-op when the config already used len(tokenizer), but stays as a
    # defensive guard against future tokenizer additions.
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer
|
|
|
|
|
def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    config: dict,
    output_dir: str,
    resume_from_checkpoint: Optional[str] = None,
):
    """Train the Sheikh-2.5-Coder model and save the result.

    Args:
        model: The model to train.
        tokenizer: Tokenizer used for batch collation; saved alongside the
            model so the output directory is self-contained.
        train_dataset: Tokenized training dataset.
        eval_dataset: Tokenized evaluation dataset.
        config: Full training configuration; the ``training`` section must
            provide ``learning_rate``.
        output_dir: Directory where the trained model and tokenizer are saved.
        resume_from_checkpoint: Optional path to a checkpoint to resume
            training from. Defaults to ``None`` (fresh run), which preserves
            the previous behavior; this wires up the CLI flag that was
            parsed but never consumed.
    """
    training_config = config['training']
    args = setup_training_args(
        output_dir=output_dir,
        learning_rate=training_config['learning_rate'],
    )

    # Dynamic per-batch padding; the model is passed so label padding can
    # follow its configuration.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("Starting training...")
    # Trainer.train(resume_from_checkpoint=None) is equivalent to a plain
    # train() call, so existing callers are unaffected.
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # Persist both weights and tokenizer so the output dir can be loaded
    # directly for inference.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Training completed! Model saved to {output_dir}")
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, build everything, run training."""
    parser = argparse.ArgumentParser(description="Train Sheikh-2.5-Coder model")
    parser.add_argument(
        "--config",
        type=str,
        default="training_config.yaml",
        help="Path to training configuration file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./sheikh-2.5-coder-output",
        help="Directory to save the trained model",
    )
    parser.add_argument(
        "--resume-from-checkpoint",
        type=str,
        default=None,
        help="Path to checkpoint to resume from",
    )
    cli_args = parser.parse_args()

    # NOTE(review): --resume-from-checkpoint is parsed here but never
    # forwarded to training — confirm whether it should reach the Trainer.
    config = load_config(cli_args.config)

    model, tokenizer = setup_model_and_tokenizer(config)

    # The same (stub) loader currently serves both splits.
    train_dataset = prepare_training_data(config['data'])
    eval_dataset = prepare_training_data(config['data'])

    os.makedirs(cli_args.output_dir, exist_ok=True)

    train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        config=config,
        output_dir=cli_args.output_dir,
    )


if __name__ == "__main__":
    main()