from transformers import BertTokenizer, BertForMaskedLM
import torch
# sklearn, numpy, and matplotlib are only used by the commented-out legacy
# evaluation kept below for reference.
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt

# # Step 1: Prepare the dataset
# # Load your training and validation datasets
# def read_data(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = file.readlines()
#     return data

# src_train = read_data('src_train.txt')  # File containing original sentences for training
# tgt_train = read_data('tgt_train.txt')  # File containing corresponding simplified sentences for training
# src_valid = read_data('src_valid.txt')  # File containing original sentences for validation
# tgt_valid = read_data('tgt_valid.txt')  # File containing corresponding simplified sentences for validation

# # Step 2: Fine-tune the BERT model
# tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-french-wwm-cased')
# model = BertForMaskedLM.from_pretrained('dbmdz/bert-base-french-wwm-cased')

# # Fine-tune the model on your training dataset
# # You need to define the training loop here

# # Step 3: Evaluate the model
# def evaluate_model(model, tokenizer, src_valid, tgt_valid):
#     predicted_sentences = []
#     true_labels = []

#     for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
#         # Tokenize and get predictions
#         tokenized_sentence = tokenizer.encode(src_sentence, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model(tokenized_sentence)
#             predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()

#         # Decode predicted sentence
#         predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)

#         # Append to lists
#         predicted_sentences.append(predicted_sentence)
#         true_labels.append(tgt_sentence)

#     # Calculate evaluation metrics; note that sklearn treats each distinct
#     # sentence string as its own class label here, so these scores are coarse
#     precision = precision_score(true_labels, predicted_sentences, average='weighted')
#     recall = recall_score(true_labels, predicted_sentences, average='weighted')
#     f1 = f1_score(true_labels, predicted_sentences, average='weighted')

#     # Create confusion matrix
#     labels = np.unique(true_labels)
#     cm = confusion_matrix(true_labels, predicted_sentences, labels=labels)

#     return precision, recall, f1, cm, labels, predicted_sentences

# # Return labels and predicted_sentences too: both are needed below, and
# # naming the matrix `cm` avoids shadowing sklearn's confusion_matrix.
# precision, recall, f1, cm, labels, predicted_sentences = evaluate_model(model, tokenizer, src_valid, tgt_valid)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)
# print("Confusion Matrix:")
# print(cm)

# # Step 4: Analyze the results
# # Count the number of sentences with perfect matches (>70% match, >50% match, <20% match)

# def match_percentage(sentence1, sentence2):
#     n = len(sentence1)
#     if n == 0:
#         return 0.0
#     common = sum([1 for x, y in zip(sentence1, sentence2) if x == y])
#     return common / n

# matches_70 = 0
# matches_50 = 0
# matches_20 = 0
# for pred, true in zip(predicted_sentences, tgt_valid):
#     percentage = match_percentage(pred, true)
#     if percentage > 0.7:
#         matches_70 += 1
#     if percentage > 0.5:
#         matches_50 += 1
#     if percentage < 0.2:
#         matches_20 += 1

# print("Number of sentences with >70% match:", matches_70)
# print("Number of sentences with >50% match:", matches_50)
# print("Number of sentences with <20% match:", matches_20)

# # Save confusion matrix as image
# plt.figure(figsize=(8, 6))
# plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(labels))
# plt.xticks(tick_marks, labels, rotation=45)
# plt.yticks(tick_marks, labels)
# plt.xlabel('Predicted Label')
# plt.ylabel('True Label')
# plt.tight_layout()
# plt.savefig('confusion_matrix.png')


# Step 1: Prepare the dataset
# Load your training and validation datasets
def read_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return data

def read_picto_ids(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
        picto_ids = [list(map(int, line.split())) for line in data]
    return picto_ids
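
# Assumed file formats (an assumption, inferred from the readers above):
# src_* / tgt_* files hold one sentence per line, and picto_id_* files hold
# one space-separated sequence of pictogram IDs per line, aligned line by
# line with the tgt_* files, e.g. the line "11317 5621 7155" becomes the
# list [11317, 5621, 7155].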

src_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\src_train.txt')  # File containing original sentences for training
tgt_train = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\tgt_train.txt')  # File containing corresponding simplified sentences for training
picto_train = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\train_files\picto_id_train.txt')  # File containing picto IDs for training

src_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\src_valid.txt')  # File containing original sentences for validation
tgt_valid = read_data(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\tgt_valid.txt')  # File containing corresponding simplified sentences for validation
picto_valid = read_picto_ids(r'C:\Users\LENOVO\Downloads\ToPicto\data\Text-to-Picto\valid_files\picto_id_valid.txt')  # File containing picto IDs for validation

# Now src_train, tgt_train, and picto_train are lists containing the sentences and picto IDs from the files.

# Step 2: Fine-tune the BERT model
# Same as before; a minimal illustrative training loop is sketched below.
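
# A minimal masked-LM fine-tuning sketch (an assumption, not the original
# training loop): mask a fraction of the tokens in each target sentence and
# train the LM head to recover them. Batch size 1, and all hyperparameters
# (epochs, lr, mask_prob) are illustrative only.
from torch.optim import AdamW

def fine_tune_mlm(model, tokenizer, sentences, epochs=1, lr=5e-5, mask_prob=0.15):
    optimizer = AdamW(model.parameters(), lr=lr)
    model.train()
    for _ in range(epochs):
        for sentence in sentences:
            enc = tokenizer(sentence.strip(), return_tensors='pt', truncation=True)
            input_ids = enc['input_ids']
            labels = input_ids.clone()
            # Choose positions to mask, never touching the special tokens.
            special = (input_ids == tokenizer.cls_token_id) | (input_ids == tokenizer.sep_token_id)
            mask = (torch.rand(input_ids.shape) < mask_prob) & ~special
            if not mask.any():
                continue
            labels[~mask] = -100  # only masked positions contribute to the loss
            masked_ids = input_ids.masked_fill(mask, tokenizer.mask_token_id)
            loss = model(input_ids=masked_ids,
                         attention_mask=enc['attention_mask'],
                         labels=labels).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    model.eval()
    return model

# Example usage (hypothetical): model = fine_tune_mlm(model, tokenizer, tgt_train)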

# Step 3: Evaluate the model
def evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid):
    predicted_sentences = []
    true_labels = []

    for src_sentence, tgt_sentence in zip(src_valid, tgt_valid):
        # Tokenize the source sentence and take the argmax of the MLM logits
        tokenized_sentence = tokenizer.encode(src_sentence.strip(), return_tensors='pt')
        with torch.no_grad():
            outputs = model(tokenized_sentence)
            predictions = outputs.logits[0].argmax(dim=-1).cpu().numpy()

        # Decode the predicted token IDs back into a sentence
        predicted_sentence = tokenizer.decode(predictions, skip_special_tokens=True)

        predicted_sentences.append(predicted_sentence)
        true_labels.append(tgt_sentence.strip())

    # Bucket each validation sentence by word-level overlap with its
    # reference. The model predicts words rather than picto IDs, so the
    # comparison is done on words; picto_valid is accepted for interface
    # compatibility but is not used in the word-level comparison.
    accuracies = {"100%": 0, "70%": 0, "50%": 0, "20%": 0}
    for pred, true in zip(predicted_sentences, true_labels):
        pred_words = pred.split()
        true_words = true.split()
        if pred_words == true_words:
            accuracies["100%"] += 1
        elif true_words:
            match_count = sum(1 for x, y in zip(pred_words, true_words) if x == y)
            match_percentage = match_count / len(true_words)
            if match_percentage >= 0.7:
                accuracies["70%"] += 1
            elif match_percentage >= 0.5:
                accuracies["50%"] += 1
            elif match_percentage >= 0.2:
                accuracies["20%"] += 1

    return accuracies
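
# Quick sanity check of the bucketing (hypothetical toy data): with
# pred = "je veux manger" and true = "je veux dormir", 2 of the 3 words
# match (~0.67), so the pair falls in the "50%" bucket.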

from transformers import CamembertTokenizer, CamembertForMaskedLM

# You can replace this checkpoint with any other CamemBERT model, e.g.
# "camembert-base" or "camembert/camembert-large". A masked-LM head is
# required because evaluate_model reads outputs.logits.
tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base-wikipedia-4gb")
model = CamembertForMaskedLM.from_pretrained("camembert/camembert-base-wikipedia-4gb")

accuracies = evaluate_model(model, tokenizer, src_valid, tgt_valid, picto_valid)
print("Accuracies based on picto IDs:")
print(accuracies)