smeintadmin
/

image_intents

Text Classification

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

image_intents / confirmTrain.py

smeintadmin's picture

Upload 16 files

1ae8986 over 1 year ago

history blame contribute delete

2.83 kB

	from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
	from datasets import Dataset, load_from_disk, concatenate_datasets
	import os
	import torch
	import numpy as np
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

	# --- Configuration ---
	# Checkpoint name the tokenizer is loaded from.
	MODEL_NAME = "roberta-large"
	# Local folder the sequence-classification model is loaded from.
	SAVE_MODEL_FOLDER = "img_intents_model"
	# Folder that must contain "training_args.bin".
	OUTPUT_DIR = "./results"
	# Display names used in the per-class report for label 0 and label 1.
	NEG_NAME = "NEGATIVE"
	POS_NAME = "POSITIVE"

	# Load the model and tokenizer
	model = AutoModelForSequenceClassification.from_pretrained(SAVE_MODEL_FOLDER)
	# NOTE(review): the tokenizer comes from MODEL_NAME, not SAVE_MODEL_FOLDER;
	# presumably no tokenizer files were saved with the model -- confirm.
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# Load the training arguments
	# training_args.bin is a pickled Python object (presumably the
	# TrainingArguments written by Trainer), not a tensor state dict.
	# PyTorch >= 2.6 defaults to weights_only=True, which refuses such pickles,
	# so full unpickling is requested explicitly.
	# SECURITY: unpickling executes arbitrary code -- only load a file that was
	# produced by your own training run.
	training_args = torch.load(
	    os.path.join(OUTPUT_DIR, "training_args.bin"),
	    weights_only=False,
	)

	def _read_stripped_lines(path):
	    """Return the lines of the text file at *path*, each stripped of surrounding whitespace.

	    Blank lines are kept (as empty strings), so the number of returned
	    items always equals the number of lines in the file.
	    """
	    # Explicit encoding: the platform default is not UTF-8 everywhere
	    # (e.g. Windows), which would garble non-ASCII sentences.
	    with open(path, 'r', encoding='utf-8') as file:
	        return [line.strip() for line in file]


	# Load the sentences from the text files into lists (one sentence per line)
	positives_texts = _read_stripped_lines('test_positives.txt')
	negatives_texts = _read_stripped_lines('test_negatives.txt')

	# Create datasets from the lists and add a 'label' column
	# (1 = positive, 0 = negative)
	positives_dataset = Dataset.from_dict({'text': positives_texts, 'label': [1]*len(positives_texts)})
	negatives_dataset = Dataset.from_dict({'text': negatives_texts, 'label': [0]*len(negatives_texts)})

	# Combine into a single dataset (positives first, then negatives)
	test_dataset = concatenate_datasets([positives_dataset, negatives_dataset])

	# Preprocessing function
	def preprocess_function(examples):
	    """Tokenize the "text" entries of a batch with the module-level tokenizer.

	    Every sequence is truncated and padded to exactly 512 tokens.
	    """
	    texts = examples["text"]
	    encoded = tokenizer(
	        texts,
	        padding='max_length',
	        max_length=512,
	        truncation=True,
	    )
	    return encoded

	# Tokenize the whole test set, one batch at a time
	test_dataset = test_dataset.map(preprocess_function, batched=True)

	# Drop the raw text, rename 'label' to 'labels' and hand items out as torch tensors
	test_dataset = test_dataset.remove_columns(["text"])
	test_dataset = test_dataset.rename_column("label", "labels")
	test_dataset = test_dataset.with_format("torch")

	# Build a Trainer around the loaded model and the loaded arguments
	trainer = Trainer(model=model, args=training_args)

	# Run inference and keep the raw model outputs together with the true labels
	prediction_output = trainer.predict(test_dataset)
	predictions = prediction_output.predictions
	labels = prediction_output.label_ids

	# Convert predictions to binary (0 or 1): index of the highest score in each row
	binary_predictions = np.argmax(predictions, axis=1)

	# Print overall metrics
	accuracy = accuracy_score(labels, binary_predictions)
	precision = precision_score(labels, binary_predictions)
	recall = recall_score(labels, binary_predictions)
	f1 = f1_score(labels, binary_predictions)

	print(f"Overall accuracy: {accuracy}")
	print(f"Overall precision: {precision}")
	print(f"Overall recall: {recall}")
	print(f"Overall F1 score: {f1}")

	# Print the report for each class
	# Pin the label order so the matrix is always 2x2 with row 0 = NEG_NAME and
	# row 1 = POS_NAME. Without `labels`, scikit-learn only creates rows for
	# classes that occur in the data, so a one-class run would make cm[1] fail.
	cm = confusion_matrix(labels, binary_predictions, labels=[0, 1])

	for i, class_name in enumerate([NEG_NAME, POS_NAME]):
	    total = cm[i].sum()      # number of test samples whose true label is i
	    correct = cm[i][i]       # of those, how many were predicted as i
	    loss = total - correct   # misclassified samples of this class
	    # A class without any test sample would otherwise divide by zero.
	    loss_pct = loss / total * 100 if total else 0.0

	    print(f"\n{class_name}:")
	    print(f"Total: {total}")
	    print(f"Confirmed: {correct}")
	    print(f"Loss: {loss} ({loss_pct:.2f}%)")