from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data
#
# src_train = read_data('src_train.txt')  # File containing original sentences for training
# tgt_train = read_data('tgt_train.txt')  # File containing corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt')  # File containing original sentences for validation
# tgt_valid = read_data('tgt_valid.txt')  # File containing corresponding simplified sentences for validation
#
# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# # Fine-tune the model on your training dataset
# # You need to define the training loop here
#
# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []
#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#         predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)
#     # Calculate evaluation metrics
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')
#     # Create confusion matrix
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)
#     return precision, recall, f1, cm, labels, predicted_sentences
#
# precision, recall, f1, cm, labels, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)
#
# # Step 4: Analyze the results
# # Count the number of sentences in each match bracket (>70% match, >50% match, <20% match)
# def match_percentage(sentence1, sentence2):
#     n = len(sentence1)
#     if n == 0:
#         return 0.0
#     common = sum(1 for x, y in zip(sentence1, sentence2) if x == y)
#     return common / n
#
# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1
#
# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)
#
# # Save confusion matrix as image
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')
# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        # Strip trailing newlines so the exact-match comparison below works
        data = [line.strip() for line in file]
    return data
def read_picto_ids(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        picto_ids = [list(map(int, line.split())) for line in file]
    return picto_ids
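# Illustration (hypothetical data): a picto-ID file line such as "4807 11317 5621"
# parses to [4807, 11317, 5621]; one list of integer IDs per sentence.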
src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt') # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt') # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt') # File containing picto IDs for training
src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt') # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt') # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt') # File containing picto IDs for validation
# Now the *_train and *_valid variables hold the sentences and picto-ID lists read from the files.
# Step 2: Fine-tune the BERT model
# Same as before; one possible masked-LM training loop is sketched below
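# The commented-out draft above leaves the training loop undefined ("You need to
# define the training loop here"). Below is a minimal, hedged sketch of one
# possible masked-LM fine-tuning loop, assuming the goal is to adapt the LM to
# the simplified target sentences. fine_tune_mlm is a name introduced here, and
# the batch size, learning rate, and epoch count are illustrative guesses.
def fine_tune_mlm(model, tokenizer, sentences, epochs=1, batch_size=8, lr=5e-5):
    from torch.utils.data import DataLoader
    from transformers import DataCollatorForLanguageModeling

    # Tokenize once; the collator then re-masks 15% of tokens at random per batch
    encodings = tokenizer(sentences, truncation=True, padding=True, max_length=128)
    examples = [{"input_ids": ids} for ids in encodings["input_ids"]]
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    loader = DataLoader(examples, batch_size=batch_size, shuffle=True, collate_fn=collator)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for batch in loader:
            # Ignore padding positions when attending
            attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).long()
            outputs = model(input_ids=batch["input_ids"],
                            attention_mask=attention_mask,
                            labels=batch["labels"])
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    model.eval()
    return model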
# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []
    model.eval()
    for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
        # Tokenize the source sentence and take the highest-scoring token at each position
        tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
        predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
        # Decode the predicted token IDs back into a sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence)
    # Bucket each prediction by word-level overlap with its target sentence;
    # the gold picto-ID sequence supplies the expected token count
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true, picto_ids in zip(predicted_sentences, true_labels, picto_valid):
        pred_words = pred.split()
        true_words = true.split()
        if pred == true:
            accuracies["100%"] += 1
        elif true_words and len(pred_words) == len(true_words) == len(picto_ids):
            match_count = sum(1 for x, y in zip(pred_words, true_words) if x == y)
            match_percentage = match_count / len(true_words)
            if match_percentage >= 0.7:
                accuracies["70%"] += 1
            elif match_percentage >= 0.5:
                accuracies["50%"] += 1
            elif match_percentage >= 0.2:
                accuracies["20%"] += 1
    return accuracies
# Use a French CamemBERT masked-LM checkpoint for tokenization and evaluation.
# You can replace "camembert/camembert-base-wikipedia-4gb" with any other
# CamemBERT checkpoint, e.g. "camembert-base" or "camembert/camembert-large".
from transformers import CamembertForMaskedLM, CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
model = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")
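# Hypothetical hook into the fine_tune_mlm sketch above (a name introduced in
# this file, not part of the original); uncomment to fine-tune before evaluating.
# model = fine_tune_mlm(model, tokenizer, tgt_train)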
accuracies = evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid)
print("Validation sentences per match-percentage bucket:")
print(accuracies)
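# Optional sketch: visualize the buckets with the already-imported matplotlib,
# mirroring the figure-saving in the commented-out draft above; the output
# file name is illustrative.
plt.figure(figsize=(6, 4))
plt.bar(list(accuracies.keys()), list(accuracies.values()), color='steelblue')
plt.title('Validation sentences per match-percentage bucket')
plt.xlabel('Match-percentage bucket')
plt.ylabel('Number of sentences')
plt.tight_layout()
plt.savefig('accuracy_buckets.png')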