from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data
# src_train = read_data('src_train.txt') # File containing original sentences for training
# tgt_train = read_data('tgt_train.txt') # File containing corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt') # File containing original sentences for validation
# tgt_valid = read_data('tgt_valid.txt') # File containing corresponding simplified sentences for validation
# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# # Fine-tune the model on your training dataset
# # You need to define the training loop here
# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []
#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#         predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)
#     # Calculate evaluation metrics
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')
#     # Create confusion matrix
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)
#     # labels and predicted_sentences are returned as well so the match
#     # counting and confusion-matrix plot below can use them; naming the
#     # matrix cm avoids shadowing sklearn's confusion_matrix function.
#     return precision, recall, f1, cm, labels, predicted_sentences
# precision, recall, f1, cm, labels, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)
# # Step 4: Analyze the results
# # Count how many sentences fall into each match bracket (>70%, >50%, <20% match)
# def match_percentage(sentence1, sentence2):
#     n = len(sentence1)
#     if n == 0:
#         return 0.0
#     common = sum([1 for x, y in zip(sentence1, sentence2) if x == y])
#     return common / n
# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1
# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)
# # Save confusion matrix as image
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')
# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    # One sentence per line; strip the trailing newline so that later
    # string comparisons are not affected by it.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [line.strip() for line in file]
    return data
def read_picto_ids(file_path):
    # One line per sentence, containing whitespace-separated integer picto IDs.
    with open(file_path, 'r', encoding='utf-8') as file:
        picto_ids = [list(map(int, line.split())) for line in file]
    return picto_ids
src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt') # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt') # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt') # File containing picto IDs for training
src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt') # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt') # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt') # File containing picto IDs for validation
# Now src_train, tgt_train, and picto_train are lists containing the sentences and picto IDs from the files.
# Step 2: Fine-tune the BERT model
# As in the commented-out draft above, the fine-tuning loop itself is not
# implemented here; a minimal sketch follows below.
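# The sketch below is not the original training setup; it is a minimal
# masked-language-model fine-tuning loop added for illustration. The function
# name fine_tune_mlm, the 15% masking rate and the default hyperparameters
# (epochs, batch_size, lr, max_length) are all assumptions.
def fine_tune_mlm(model, tokenizer, sentences, epochs=1, batch_size=8, lr=5e-5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for _ in range(epochs):
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            enc = tokenizer(batch, return_tensors='pt', padding=True,
                            truncation=True, max_length=128).to(device)
            labels = enc['input_ids'].clone()
            # Standard MLM objective: randomly mask 15% of the non-special
            # tokens and compute the loss only on those positions.
            special = (labels == tokenizer.pad_token_id) | \
                      (labels == tokenizer.cls_token_id) | \
                      (labels == tokenizer.sep_token_id)
            mask = (torch.rand(labels.shape, device=device) < 0.15) & ~special
            enc['input_ids'][mask] = tokenizer.mask_token_id
            labels[~mask] = -100  # ignored by the loss
            loss = model(**enc, labels=labels).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    model.eval()
    return model
# Example call (uncomment once the model and tokenizer are loaded in the step below):
# model = fine_tune_mlm(model, tokenizer, src_train)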
# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []
    for src_sentence, tgt_sentence, picto_ids in zip(src_valid, tgt_valid, picto_valid):
        # Tokenize and get predictions
        tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
        predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
        # Decode predicted sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
        # Append to lists
        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence)
    # Bucket each prediction by how closely it matches the reference. The gold
    # picto IDs in picto_valid are assumed to align one-to-one with the words
    # of the reference sentence, so a word-level match against the reference is
    # used as a stand-in for a picto-ID match (the model only outputs text and
    # no word-to-picto mapping is defined in this script).
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        if pred == true:
            accuracies["100%"] += 1
            continue
        pred_words = pred.split()
        true_words = true.split()
        if not true_words:
            continue
        match_count = sum(1 for x, y in zip(pred_words, true_words) if x == y)
        match_percentage = match_count / len(true_words)
        if match_percentage >= 0.7:
            accuracies["70%"] += 1
        elif match_percentage >= 0.5:
            accuracies["50%"] += 1
        elif match_percentage >= 0.2:
            accuracies["20%"] += 1
    return accuracies
from transformers import CamembertForMaskedLM, CamembertTokenizer
# You can replace "camembert/camembert-base-wikipedia-4gb" with any other
# CamemBERT checkpoint, e.g. "camembert-base" or "camembert/camembert-large".
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
# A masked-LM head is required so that evaluate_model can read outputs.logits;
# the bare CamembertModel does not expose logits, and the variable must be
# named model because it is passed to evaluate_model below.
model = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")
accuracies = evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies based on picto IDs:") | |
print(accuracies) | |
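# Optional: also report each bucket as a share of the evaluated sentences
# (this assumes every sentence in src_valid was evaluated above).
total = len(src_valid)
if total:
    for bucket, count in accuracies.items():
        print(f"{bucket} match bucket: {count} sentences ({count / total:.1%})")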