# image_intents/confirmTrain.py
import os

import numpy as np
import torch
from datasets import Dataset, concatenate_datasets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
MODEL_NAME = "roberta-large"
SAVE_MODEL_FOLDER = "img_intents_model"
OUTPUT_DIR = "./results"
NEG_NAME = "NEGATIVE"
POS_NAME = "POSITIVE"
# Load the fine-tuned model from the saved checkpoint and the tokenizer from the base model
model = AutoModelForSequenceClassification.from_pretrained(SAVE_MODEL_FOLDER)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the training arguments
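# Note: training_args.bin is the file Trainer writes during the original training run.
# On newer PyTorch releases (2.6+) torch.load defaults to weights_only=True, which cannot
# unpickle a TrainingArguments object; passing weights_only=False may be needed there.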
training_args = torch.load(os.path.join(OUTPUT_DIR, "training_args.bin"))
# Load the sentences from the text files into lists
with open('test_positives.txt', 'r') as file:
    positives_texts = [line.strip() for line in file.readlines()]
with open('test_negatives.txt', 'r') as file:
    negatives_texts = [line.strip() for line in file.readlines()]
# Create datasets from the lists and add a 'label' column
positives_dataset = Dataset.from_dict({'text': positives_texts, 'label': [1]*len(positives_texts)})
negatives_dataset = Dataset.from_dict({'text': negatives_texts, 'label': [0]*len(negatives_texts)})
# Combine into a single dataset
test_dataset = concatenate_datasets([positives_dataset, negatives_dataset])
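# The combined test set has one row per sentence: positives labeled 1, negatives labeled 0.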
# Preprocessing function
def preprocess_function(examples):
    # Tokenize, truncating and padding every example to max_length so all tensors batch together
    return tokenizer(examples["text"], truncation=True, max_length=512, padding='max_length')
test_dataset = test_dataset.map(preprocess_function, batched=True)
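# map() with batched=True adds the tokenizer outputs (input_ids, attention_mask) as new columns.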
# Drop the raw text column, rename 'label' to 'labels' (the name the Trainer expects), and return torch tensors
test_dataset = test_dataset.remove_columns(["text"]).rename_column("label", "labels").with_format("torch")
# Create the Trainer object
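# No train/eval datasets are attached; the Trainer is only used to run predict() below.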
trainer = Trainer(
    model=model,
    args=training_args,
)
# Run the model over the test set; predict() returns (predictions, label_ids, metrics)
predictions, labels, _ = trainer.predict(test_dataset)
# predictions holds raw logits of shape (num_examples, 2); argmax gives the predicted class (0 or 1)
binary_predictions = np.argmax(predictions, axis=1)
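# If per-example confidence scores were needed, the logits could be converted to probabilities,
# e.g. probs = torch.softmax(torch.from_numpy(predictions), dim=-1) (a sketch only; not used below).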
# Print overall metrics
accuracy = accuracy_score(labels, binary_predictions)
precision = precision_score(labels, binary_predictions)
recall = recall_score(labels, binary_predictions)
f1 = f1_score(labels, binary_predictions)
print(f"Overall accuracy: {accuracy}")
print(f"Overall precision: {precision}")
print(f"Overall recall: {recall}")
print(f"Overall F1 score: {f1}")
# Print the per-class breakdown from the confusion matrix
cm = confusion_matrix(labels, binary_predictions)
for i, class_name in enumerate([NEG_NAME, POS_NAME]):
    total = cm[i].sum()       # examples whose true class is class_name
    correct = cm[i][i]        # of those, how many were classified correctly
    loss = total - correct    # and how many were misclassified
    print(f"\n{class_name}:")
    print(f"Total: {total}")
    print(f"Confirmed: {correct}")
    print(f"Loss: {loss} ({loss / total * 100:.2f}%)")