import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset
import wandb
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load dataset (expects 'text' and 'label' columns, with integer labels 0-2)
data = pd.read_csv('sentences.csv')

# Split dataset into train and eval sets
train_df, eval_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Initialize the tokenizer and model
model_name = 'classla/bcms-bertic'
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=20,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=20,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to='wandb',
    run_name='sentiment-classification',
)

# Initialize WandB
wandb.init(project="sentiment-classification", entity="dejan")

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Finish the WandB run
wandb.finish()

# Save the model
model.save_pretrained('./sentiment-model')
tokenizer.save_pretrained('./sentiment-model')
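
# --------------------------------------------------------------------------
# Optional inference sketch (not part of the training run above): a minimal
# example of loading the saved model with the Hugging Face pipeline API.
# The label ids returned as LABEL_0/LABEL_1/LABEL_2 correspond to whatever
# integer encoding 'label' used in sentences.csv -- the negative/neutral/
# positive mapping suggested here is an assumption, so adjust it to your data.
# The example sentence is likewise hypothetical input.
#
# from transformers import pipeline
#
# classifier = pipeline(
#     'text-classification',
#     model='./sentiment-model',
#     tokenizer='./sentiment-model',
# )
#
# # "Ovaj proizvod je odlican!" ~ "This product is excellent!"
# print(classifier('Ovaj proizvod je odličan!'))
# # -> [{'label': 'LABEL_2', 'score': ...}] unless id2label is set in the
# #    model config, e.g. {0: 'negative', 1: 'neutral', 2: 'positive'}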