from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data
# src_train = read_data('src_train.txt')  # File containing original sentences for training
# tgt_train = read_data('tgt_train.txt')  # File containing corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt')  # File containing original sentences for validation
# tgt_valid = read_data('tgt_valid.txt')  # File containing corresponding simplified sentences for validation
# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# # Fine-tune the model on your training dataset
# # You need to define the training loop here
# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []
#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#         predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)
#     # Calculate evaluation metrics (each full sentence is treated as one class label)
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')
#     # Create confusion matrix (stored in cm so it does not shadow sklearn's confusion_matrix)
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)
#     # Also return the predicted sentences, since Step 4 below needs them
#     return precision, recall, f1, cm, predicted_sentences
# precision, recall, f1, cm, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)
# # Step 4: Analyze the results
# # Count the number of sentences with >70% match, >50% match, and <20% match
# def match_percentage(sentence1, sentence2):
#     n = len(sentence1)
#     if n == 0:
#         return 0.0
#     common = sum(1 for x, y in zip(sentence1, sentence2) if x == y)
#     return common / n
# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1
# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)
# # Save the confusion matrix as an image
# labels = np.unique(tgt_valid)  # recompute the labels; they were local to evaluate_model
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')
# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return data

def read_picto_ids(file_path):
    # Each line is expected to hold the picto IDs for one sentence, as space-separated integers
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    picto_ids = [list(map(int, line.split())) for line in data]
    return picto_ids
src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt')  # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt')  # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt')  # File containing picto IDs for training
src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt')  # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt')  # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt')  # File containing picto IDs for validation
# Now src_train, tgt_train, and picto_train are lists containing the sentences and picto IDs from the files.
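# Quick sanity check (illustrative sketch, not part of the original script): the zips below
# assume the source, target, and picto ID files are line-aligned, one entry per sentence.
assert len(src_train) == len(tgt_train) == len(picto_train), "Training files are not line-aligned"
assert len(src_valid) == len(tgt_valid) == len(picto_valid), "Validation files are not line-aligned"
print("Train size:", len(src_train), "- Valid size:", len(src_valid))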
# Step 2: Fine-tune the BERT model
# As in the commented-out version above, the fine-tuning loop is not defined in this script;
# a minimal sketch of what it could look like follows.
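# --- Minimal fine-tuning sketch (illustrative only; the function name, checkpoint, and
# --- hyperparameters are assumptions, not part of the original script). It adapts a masked-LM
# --- model to the target (simplified) sentences using random 15% token masking.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM

def fine_tune_mlm(sentences, model_name='camembert-base', epochs=1, batch_size=8, lr=5e-5):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device).train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loader = DataLoader(sentences, batch_size=batch_size, shuffle=True)
    for _ in range(epochs):
        for batch in loader:
            enc = tokenizer(list(batch), padding=True, truncation=True, max_length=128,
                            return_tensors='pt').to(device)
            labels = enc['input_ids'].clone()
            # Randomly mask 15% of the non-special tokens and train the model to recover them
            special = torch.tensor(
                [tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
                 for ids in labels.tolist()], dtype=torch.bool, device=device)
            mask = (torch.rand(labels.shape, device=device) < 0.15) & ~special
            labels[~mask] = -100                    # ignore unmasked positions in the loss
            inputs = enc['input_ids'].clone()
            inputs[mask] = tokenizer.mask_token_id  # replace masked positions with the mask token
            outputs = model(input_ids=inputs, attention_mask=enc['attention_mask'], labels=labels)
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    return model, tokenizer

# Example call (commented out so the evaluation below still runs without training):
# model, tokenizer = fine_tune_mlm(tgt_train, epochs=1)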
# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []
    for src_sentence, tgt_sentence, picto_ids in zip(src_valid, tgt_valid, picto_valid):
        # Tokenize and get the masked-LM prediction for every position
        tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
        predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()
        # Decode the predicted sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)
        # Append to lists
        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence)
    # Bucket the sentences by how closely the prediction matches the reference.
    # Note: picto_valid holds the reference picto IDs, but the model outputs text, so there are
    # no predicted picto IDs to compare them against; the partial-match buckets are therefore
    # computed at the word level between the predicted and the reference simplified sentences.
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        pred_tokens = pred.strip().split()
        true_tokens = true.strip().split()
        if pred_tokens == true_tokens:
            accuracies["100%"] += 1
        elif true_tokens:
            match_count = sum(1 for x, y in zip(pred_tokens, true_tokens) if x == y)
            match_percentage = match_count / len(true_tokens)
            if match_percentage >= 0.7:
                accuracies["70%"] += 1
            elif match_percentage >= 0.5:
                accuracies["50%"] += 1
            elif match_percentage >= 0.2:
                accuracies["20%"] += 1
    return accuracies
from transformers import CamembertForMaskedLM, CamembertTokenizer
# You can replace this checkpoint with any other CamemBERT model, e.g. "camembert/camembert-large".
# A masked-LM head is required here because evaluate_model reads outputs.logits.
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
camembert = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")
accuracies = evaluate_model(camembert, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies (match-percentage buckets):")
print(accuracies)
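# Optional: visualize the bucket counts (illustrative sketch; the script imports matplotlib but
# only uses it in the commented-out confusion matrix code above, so this plot is an assumption).
plt.figure(figsize=(6, 4))
plt.bar(list(accuracies.keys()), list(accuracies.values()), color='steelblue')
plt.xlabel('Match bucket')
plt.ylabel('Number of validation sentences')
plt.title('Prediction match buckets on the validation set')
plt.tight_layout()
plt.savefig('match_buckets.png')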