from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data
# src_train = read_data('src_train.txt')  # File containing original sentences for training
# tgt_train = read_data('tgt_train.txt')  # File containing corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt')  # File containing original sentences for validation
# tgt_valid = read_data('tgt_valid.txt')  # File containing corresponding simplified sentences for validation
# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# # Fine-tune the model on your training dataset
# # You need to define the training loop here
# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []
#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#         predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)
#     # Calculate evaluation metrics (each full sentence is treated as one class label)
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')
#     # Create confusion matrix (stored in cm so it does not shadow sklearn's confusion_matrix)
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)
#     # Also return the predicted sentences, since Step 4 below needs them
#     return precision, recall, f1, cm, predicted_sentences
# precision, recall, f1, cm, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)
# # Step 4: Analyze the results
# # Count the number of sentences with >70% match, >50% match, and <20% match
# def match_percentage(sentence1, sentence2):
#     n = len(sentence1)
#     if n == 0:
#         return 0.0
#     common = sum(1 for x, y in zip(sentence1, sentence2) if x == y)
#     return common / n
# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1
# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)
# # Save the confusion matrix as an image
# labels = np.unique(tgt_valid)  # recompute the labels; they were local to evaluate_model
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')
# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return data

def read_picto_ids(file_path):
    # Each line is expected to hold the picto IDs for one sentence, as space-separated integers
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    picto_ids = [list(map(int, line.split())) for line in data]
    return picto_ids
src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt')  # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt')  # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt')  # File containing picto IDs for training
src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt')  # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt')  # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt')  # File containing picto IDs for validation
# Now src_train, tgt_train, and picto_train are lists containing the sentences and picto IDs from the files.
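# Quick sanity check (illustrative sketch, not part of the original script): the zips below
# assume the source, target, and picto ID files are line-aligned, one entry per sentence.
assert len(src_train) == len(tgt_train) == len(picto_train), "Training files are not line-aligned"
assert len(src_valid) == len(tgt_valid) == len(picto_valid), "Validation files are not line-aligned"
print("Train size:", len(src_train), "- Valid size:", len(src_valid))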
# Step 2: Fine-tune the BERT model
# As in the commented-out version above, the fine-tuning loop is not defined in this script;
# a minimal sketch of what it could look like follows.
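# --- Minimal fine-tuning sketch (illustrative only; the function name, checkpoint, and
# --- hyperparameters are assumptions, not part of the original script). It adapts a masked-LM
# --- model to the target (simplified) sentences using random 15% token masking.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM

def fine_tune_mlm(sentences, model_name='camembert-base', epochs=1, batch_size=8, lr=5e-5):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device).train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loader = DataLoader(sentences, batch_size=batch_size, shuffle=True)
    for _ in range(epochs):
        for batch in loader:
            enc = tokenizer(list(batch), padding=True, truncation=True, max_length=128,
                            return_tensors='pt').to(device)
            labels = enc['input_ids'].clone()
            # Randomly mask 15% of the non-special tokens and train the model to recover them
            special = torch.tensor(
                [tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
                 for ids in labels.tolist()], dtype=torch.bool, device=device)
            mask = (torch.rand(labels.shape, device=device) < 0.15) & ~special
            labels[~mask] = -100                    # ignore unmasked positions in the loss
            inputs = enc['input_ids'].clone()
            inputs[mask] = tokenizer.mask_token_id  # replace masked positions with the mask token
            outputs = model(input_ids=inputs, attention_mask=enc['attention_mask'], labels=labels)
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    return model, tokenizer

# Example call (commented out so the evaluation below still runs without training):
# model, tokenizer = fine_tune_mlm(tgt_train, epochs=1)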
# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []
    for src_sentence, tgt_sentence, picto_ids in zip(src_valid, tgt_valid, picto_valid):
        # Tokenize and get the masked-LM prediction for every position
        tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
        predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
        # Decode the predicted sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
        # Append to lists
        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence)
    # Bucket the sentences by how closely the prediction matches the reference.
    # Note: picto_valid holds the reference picto IDs, but the model outputs text, so there are
    # no predicted picto IDs to compare them against; the partial-match buckets are therefore
    # computed at the word level between the predicted and the reference simplified sentences.
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        pred_tokens = pred.strip().split()
        true_tokens = true.strip().split()
        if pred_tokens == true_tokens:
            accuracies["100%"] += 1
        elif true_tokens:
            match_count = sum(1 for x, y in zip(pred_tokens, true_tokens) if x == y)
            match_percentage = match_count / len(true_tokens)
            if match_percentage >= 0.7:
                accuracies["70%"] += 1
            elif match_percentage >= 0.5:
                accuracies["50%"] += 1
            elif match_percentage >= 0.2:
                accuracies["20%"] += 1
    return accuracies
from transformers import CamembertForMaskedLM, CamembertTokenizer
# You can replace this checkpoint with any other CamemBERT model, e.g. "camembert/camembert-large".
# A masked-LM head is required here because evaluate_model reads outputs.logits.
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
camembert = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")
accuracies = evaluate_model(camembert, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies (match-percentage buckets):")
print(accuracies)
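# Optional: visualize the bucket counts (illustrative sketch; the script imports matplotlib but
# only uses it in the commented-out confusion matrix code above, so this plot is an assumption).
plt.figure(figsize=(6, 4))
plt.bar(list(accuracies.keys()), list(accuracies.values()), color='steelblue')
plt.xlabel('Match bucket')
plt.ylabel('Number of validation sentences')
plt.title('Prediction match buckets on the validation set')
plt.tight_layout()
plt.savefig('match_buckets.png')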