from transformers import BertTokenizer, BertForMaskedLM
import torch
# sklearn, numpy, and matplotlib are only used by the commented-out legacy
# evaluation kept below for reference.
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt

# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data

# src_train = read_data('src_train.txt')  # File containing original sentences for training
# tgt_train = read_data('tgt_train.txt')  # File containing corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt')  # File containing original sentences for validation
# tgt_valid = read_data('tgt_valid.txt')  # File containing corresponding simplified sentences for validation

# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')

# # Fine-tune the model on your training dataset
# # You need to define the training loop here

# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []

#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#             predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()

#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)

#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)

#     # Calculate evaluation metrics; note that sklearn treats each distinct
#     # sentence string as its own class label here, so these scores are coarse
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')

#     # Create confusion matrix
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)

#     return precision, recall, f1, cm, labels, predicted_sentences

# # Return labels and predicted_sentences too: both are needed below, and
# # naming the matrix `cm` avoids shadowing sklearn's confusion_matrix.
# precision, recall, f1, cm, labels, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)

# # Step 4: Analyze the results
# # Count the number of sentences with perfect matches (>70% match, >50% match, <20% match)

# def match_percentage(sentence1, sentence2):
#     n = len(sentence1)
#     if n == 0:
#         return 0.0
#     common = sum([1 for x, y in zip(sentence1, sentence2) if x == y])
#     return common / n

# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1

# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)

# # Save confusion matrix as image
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')


# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return data

def read_picto_ids(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
        picto_ids = [list(map(int, line.split())) for line in data]
    return picto_ids
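
# Assumed file formats (an assumption, inferred from the readers above):
# src_* / tgt_* files hold one sentence per line, and picto_id_* files hold
# one space-separated sequence of pictogram IDs per line, aligned line by
# line with the tgt_* files, e.g. the line "11317 5621 7155" becomes the
# list [11317, 5621, 7155].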

src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt')  # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt')  # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt')  # File containing picto IDs for training

src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt')  # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt')  # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt')  # File containing picto IDs for validation

# Now src_train, tgt_train, and picto_train are lists containing the sentences and picto IDs from the files.

# Step 2: Fine-tune the BERT model
# Same as before; a minimal illustrative training loop is sketched below.
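
# A minimal masked-LM fine-tuning sketch (an assumption, not the original
# training loop): mask a fraction of the tokens in each target sentence and
# train the LM head to recover them. Batch size 1, and all hyperparameters
# (epochs, lr, mask_prob) are illustrative only.
from torch.optim import AdamW

def fine_tune_mlm(model, tokenizer, sentences, epochs=1, lr=5e-5, mask_prob=0.15):
    optimizer = AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for sentence in sentences:
            enc = tokenizer(sentence.strip(), return_tensors='pt', truncation=True)
            input_ids = enc['input_ids']
            labels = input_ids.clone()
            # Choose positions to mask, never touching the special tokens.
            special = (input_ids == tokenizer.cls_token_id) | (input_ids == tokenizer.sep_token_id)
            mask = (torch.rand(input_ids.shape) < mask_prob) & ~special
            if not mask.any():
                continue
            labels[~mask] = -100  # only masked positions contribute to the loss
            masked_ids = input_ids.masked_fill(mask, tokenizer.mask_token_id)
            loss = model(input_ids=masked_ids,
                         attention_mask=enc['attention_mask'],
                         labels=labels).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    model.eval()
    return model

# Example usage (hypothetical): model = fine_tune_mlm(model, tokenizer, tgt_train)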

# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []

    for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
        # Tokenize the source sentence and take the argmax of the MLM logits
        tokenized_sentence = tokenizer.encode(src_sentence.strip(), return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
            predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()

        # Decode the predicted token IDs back into a sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)

        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence.strip())

    # Bucket each validation sentence by word-level overlap with its
    # reference. The model predicts words rather than picto IDs, so the
    # comparison is done on words; picto_valid is accepted for interface
    # compatibility but is not used in the word-level comparison.
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        pred_words = pred.split()
        true_words = true.split()
        if pred_words == true_words:
            accuracies["100%"] += 1
        elif true_words:
            match_count = sum(1 for x, y in zip(pred_words, true_words) if x == y)
            match_percentage = match_count / len(true_words)
            if match_percentage >= 0.7:
                accuracies["70%"] += 1
            elif match_percentage >= 0.5:
                accuracies["50%"] += 1
            elif match_percentage >= 0.2:
                accuracies["20%"] += 1

    return accuracies
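
# Quick sanity check of the bucketing (hypothetical toy data): with
# pred = "je veux manger" and true = "je veux dormir", 2 of the 3 words
# match (~0.67), so the pair falls in the "50%" bucket.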

from transformers import CamembertTokenizer, CamembertForMaskedLM

# You can replace this checkpoint with any other CamemBERT model, e.g.
# "camembert-base" or "camembert/camembert-large". A masked-LM head is
# required because evaluate_model reads outputs.logits.
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
model = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")

accuracies = evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies based on picto IDs:")
print(accuracies)