import json

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load configuration. Expected keys: model_name, num_labels, learning_rate,
# batch_size, num_epochs. Illustrative example (values are assumptions, not
# pinned by this script):
# {
#   "model_name": "bert-base-uncased",
#   "num_labels": 2,
#   "learning_rate": 2e-5,
#   "batch_size": 16,
#   "num_epochs": 3
# }
with open('../config/config.json') as f:
    config = json.load(f)

# Load the train/validation splits from CSV files; each file needs a 'text'
# column (and a label column for sequence classification)
dataset = load_dataset(
    'csv',
    data_files={'train': '../data/train.csv', 'validation': '../data/valid.csv'},
)

# Load the pretrained classification model and its matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    config['model_name'], num_labels=config['num_labels']
)
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

# Tokenize the dataset, padding/truncating every example to the model's
# maximum sequence length
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    num_train_epochs=config['num_epochs'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model('../model')
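
# Optional follow-up: run one final evaluation pass on the validation split
# and print the metrics. A minimal sketch: trainer.evaluate() returns a dict
# containing eval_loss and runtime statistics (plus anything produced by a
# compute_metrics callback, which this script does not define).
metrics = trainer.evaluate()
print(json.dumps(metrics, indent=2))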