Spaces:
Runtime error

Set up a working dataset class

Files changed:

- src/dataloader.py +56 -37
- src/inference.py +8 -7
- src/model.py +40 -38
- src/script.py +0 -90
- src/train.py +74 -6
src/dataloader.py
CHANGED

@@ -11,17 +11,15 @@
 Creating a Vectoriser from the vocabulary:
 
 """
+import pickle
 import string
 from collections import Counter
 
 import pandas as pd
 import torch
-from nltk import word_tokenize
 
-# nltk.download('punkt')
 
-
-class Data:
+class Data(torch.utils.data.Dataset):
     """
     A class used to get data from file
     ...
@@ -44,8 +42,27 @@ class Data:
         create a dataset with cleaned data
     """
 
-    def __init__(self, path: str) -> None:
+    def __init__(self, path: str, transform=None) -> None:
         self.path = path
+        self.data = pd.read_json(path_or_buf=self.path, lines=True)
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        row = self.data.iloc[idx]
+        text = row["text"].translate(str.maketrans("", "", string.punctuation)).split()
+        summary = (
+            row["summary"].translate(str.maketrans("", "", string.punctuation)).split()
+        )
+        summary = ["<start>", *summary, "<end>"]
+        sample = {"text": text, "summary": summary}
+
+        if self.transform:
+            sample = self.transform(sample)
+
+        return sample
 
     def open(self) -> pd.DataFrame:
         """
@@ -85,26 +102,13 @@ class Data:
         # - handle proper nouns (consecutive words starting with a capital letter)
         for text in texts:
             text = text.translate(str.maketrans("", "", string.punctuation))
-            text = word_tokenize(text)
+            text = text.split()
             tokenized_texts.append(text)
 
         if text_type == "summary":
             return [["<start>", *summary, "<end>"] for summary in tokenized_texts]
         return tokenized_texts
 
-    def pad_sequence(self):
-        """
-        pad summary with empty token
-        """
-        texts = self.clean_data("text")
-        summaries = self.clean_data("summary")
-        padded_summary = []
-        for text, summary in zip(texts, summaries):
-            if len(summary) != len(text):
-                summary += ["<empty>"] * (len(text) - len(summary))
-            padded_summary.append(summary)
-        return texts, padded_summary
-
     def get_words(self) -> list:
         """
         Create a dictionary of the data vocabulary
@@ -114,15 +118,20 @@ class Data:
         summary_words = [word for text in summaries for word in text]
         return text_words + summary_words
 
-
-
-
-
-
-
-
-
-
+
+def pad_collate(data):
+    text_batch = [element[0] for element in data]
+    summary_batch = [element[1] for element in data]
+    max_len = max([len(element) for element in summary_batch + text_batch])
+    text_batch = [
+        torch.nn.functional.pad(element, (0, max_len - len(element)), value=-100)
+        for element in text_batch
+    ]
+    summary_batch = [
+        torch.nn.functional.pad(element, (0, max_len - len(element)), value=-100)
+        for element in summary_batch
+    ]
+    return text_batch, summary_batch
 
 
 class Vectoriser:
@@ -146,12 +155,25 @@ class Vectoriser:
         encode an entire row from the dataset
     """
 
-    def __init__(self, vocab) -> None:
+    def __init__(self, vocab=None) -> None:
         self.vocab = vocab
         self.word_count = Counter(word.lower().strip(",.\\-") for word in self.vocab)
         self.idx_to_token = sorted([t for t, c in self.word_count.items() if c > 1])
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}
 
+    def load(self, path):
+        with open(path, "rb") as file:
+            self.vocab = pickle.load(file)
+        self.word_count = Counter(
+            word.lower().strip(",.\\-") for word in self.vocab
+        )
+        self.idx_to_token = sorted([t for t, c in self.word_count.items() if c > 1])
+        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}
+
+    def save(self, path):
+        with open(path, "wb") as file:
+            pickle.dump(self.vocab, file)
+
     def encode(self, tokens) -> torch.tensor:
         """
         Encode a sentence according to the words it contains
@@ -165,7 +187,7 @@ class Vectoriser:
         :return: words_idx : tensor
             A tensor containing the indices of the sentence's words
         """
-        if …
+        if isinstance(tokens, list):
             words_idx = torch.tensor(
                 [
                     self.token_to_idx.get(t.lower(), len(self.token_to_idx))
@@ -175,7 +197,7 @@ class Vectoriser:
             )
 
         # Allows encoding word by word
-        elif …
+        elif isinstance(tokens, str):
             words_idx = torch.tensor(self.token_to_idx.get(tokens.lower()))
 
         return words_idx
@@ -184,9 +206,9 @@ class Vectoriser:
         """
         Decode a sentence by the inverse of the encode process
         """
-
+
         idxs = words_idx_tensor.tolist()
-        if …
+        if isinstance(idxs, int):
             words = [self.idx_to_token[idxs]]
         else:
             words = []
@@ -195,10 +217,7 @@ class Vectoriser:
             words.append(self.idx_to_token[idx])
         return words
 
-    def …
-        pass
-
-    def vectorize(self, row) -> torch.tensor:
+    def __call__(self, row) -> torch.tensor:
         """
         Encode the data from one row of the dataframe
         ----------
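Taken together, this file replaces the old pad_sequence flow with a torch Dataset plus a padding collate function. A minimal usage sketch, assuming the vectoriser transform turns the {"text", "summary"} sample into an indexable pair of index tensors (pad_collate reads element[0] and element[1]):

# Hypothetical wiring, not part of the commit: build the vocabulary on a first
# pass, then re-open the data with the vectoriser as transform and batch it.
import torch

import dataloader

train_dataset = dataloader.Data("data/train_extract.jsonl")
vectoriser = dataloader.Vectoriser(train_dataset.get_words())

train_dataset = dataloader.Data("data/train_extract.jsonl", transform=vectoriser)
loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=dataloader.pad_collate
)
for text_batch, summary_batch in loader:
    # every sequence is right-padded with -100 up to the longest text or
    # summary in the batch
    break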
src/inference.py
CHANGED

@@ -1,21 +1,22 @@
 """
 Allows to predict the summary for a given entry text
 """
+import pickle
+
 import torch
-from nltk import word_tokenize
 
-
-from …
+import dataloader
+from model import Decoder, Encoder, EncoderDecoderModel
 
 # We have to load the data to get the Vectoriser > save "words" to a file and load it back later ??
 ### TO CHANGE SO THAT ONLY THE VECTORISER HAS TO BE LOADED
 data1 = dataloader.Data("data/train_extract.jsonl")
 data2 = dataloader.Data("data/dev_extract.jsonl")
-
-dev_dataset = data2.make_dataset()
+words = pickle.load("model/vocab.pkl")
 words = data1.get_words()
 
-vectoriser = dataloader.Vectoriser(words)
+vectoriser = dataloader.Vectoriser()
+vectoriser.load("model/vocab.pkl")
 word_counts = vectoriser.word_count
 
 
@@ -30,7 +31,7 @@ def inferenceAPI(text: str) -> str:
     str
         The summary for the input text
     """
-    text = word_tokenize(text)
+    text = text.split()
     # Define the model's input parameters
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device).to(
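One caveat in the rewritten module: pickle.load takes an open file object, not a path string, so the added words = pickle.load("model/vocab.pkl") line would raise a TypeError if reached. The Vectoriser.load method added in dataloader.py already wraps this correctly; a sketch of the working form:

# pickle.load needs a binary file handle, as in Vectoriser.load
import pickle

with open("model/vocab.pkl", "rb") as file:
    words = pickle.load(file)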
src/model.py
CHANGED

@@ -6,14 +6,8 @@ import logging
 
 import torch
 
-from src import dataloader
-
 logging.basicConfig(level=logging.DEBUG)
 
-data1 = dataloader.Data("data_extract/train_extract.jsonl")
-words = data1.get_words()
-vectoriser = dataloader.Vectoriser(words)
-
 
 class Encoder(torch.nn.Module):
     def __init__(
@@ -86,51 +80,59 @@
 
 
 class EncoderDecoderModel(torch.nn.Module):
-    def __init__(self, encoder, decoder, device):
+    def __init__(self, encoder, decoder, vectoriser, device):
         # A torch idiosyncrasy, so it can work its magic
         super().__init__()
         self.encoder = encoder
         self.decoder = decoder
+        self.vectoriser = vectoriser
         self.device = device
 
-    def forward(self, source, num_beams=3):
-        …
+    def forward(self, source, num_beams=3, summary_len=0.2):
+        """
+        :param source: tensor
+            the input text
+        :param num_beams: int
+            the number of outputs to iterate on for beam_search
+        :param summary_len: int
+            length ratio of the summary compared to the text
+        """
+        # The ratio must be less than 1 to allow text compression
+        assert summary_len < 1, f"number less than 1 expected, got {summary_len}"
+
+        target_len = int(
+            summary_len * source.shape[0]
+        )  # Expected summary length (in words)
+        target_vocab_size = self.decoder.vocab_size  # Word embedding length
+
+        # Output of the right shape (expected summary length x word embedding length)
+        # filled with zeros. On each iteration, we will replace one of the rows of
+        # this matrix with the chosen word embedding
+        outputs = torch.zeros(target_len, target_vocab_size)
+
+        # put the tensors on the device (useless on CPU but very useful in case of GPU)
+        outputs.to(self.device)
+        source.to(self.device)
 
         # last hidden state of the encoder is used as the initial hidden state of the decoder
-        …
-        )
-        cell.to(
-        …
-        # first input to the decoder is the <start> token.
-        input = vectoriser.encode("<start>")  # The model's starting word
-        input.to(self.device)  # torch idiosyncrasy to put it on the GPU
-
-        ### START OF THE TEST INSTANTIATION ###
+        hidden, cell = self.encoder(source)  # Encode the input text
+        input = self.vectoriser.encode(
+            "<start>"
+        )  # Encode the first word of the summary
+
+        # put the tensors on the device
+        hidden.to(self.device)
+        cell.to(self.device)
+        input.to(self.device)
+
+        ### BEAM SEARCH ###
         # If you wonder, b stands for better
         values = None
         b_outputs = torch.zeros(target_len, target_vocab_size).to(self.device)
         b_outputs.to(self.device)
 
-        for i in range(
-            …
-        ):  # Determine as many words as the desired text length
+        for i in range(1, target_len):
+            # Determine as many words as the desired text length
             # insert input token embedding, previous hidden and previous cell states
             # receive output tensor (predictions) and new hidden and cell states.
 
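A caveat on the new forward body: torch.Tensor.to() is not in-place; it returns a new tensor, so bare calls such as outputs.to(self.device) leave the original tensor where it was. A sketch of the assignment form that actually moves the tensors, reusing the names from the method above:

# Tensor.to() returns a copy on the target device; assign it back to take effect
outputs = outputs.to(self.device)
source = source.to(self.device)
hidden = hidden.to(self.device)
cell = cell.to(self.device)
input = input.to(self.device)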
src/script.py
DELETED

@@ -1,90 +0,0 @@
-"""
-DONE :
-    - Separate the vectoriser part from the classifier
-    - Add an LSTM to the classifier
-    - train the classifier
-TO DO :
-    - Improve the model's results
-"""
-import logging
-import random
-from typing import Sequence
-
-import torch
-
-import dataloader
-from model import Decoder, Encoder, EncoderDecoderModel
-from train import train_network
-
-# logging INFO, WARNING, ERROR, CRITICAL, DEBUG
-logging.basicConfig(level=logging.INFO)
-logging.disable(level=10)
-
-import os
-
-os.environ[
-    "CUBLAS_WORKSPACE_CONFIG"
-] = ":16:8"  # so it runs deterministically on my work machine
-# environment variable in git bash: export CUBLAS_WORKSPACE_CONFIG=:16:8
-# from datasets import load_dataset
-
-### OPEN DATASET ###
-# dataset = load_dataset("newsroom", data_dir=DATA_PATH, data_files="data/train.jsonl")
-
-data1 = dataloader.Data("data/train_extract.jsonl")
-data2 = dataloader.Data("data/dev_extract.jsonl")
-train_dataset = data1.make_dataset()
-dev_dataset = data2.make_dataset()
-words = data1.get_words()
-
-vectoriser = dataloader.Vectoriser(words)
-word_counts = vectoriser.word_count
-
-
-def predict(model, tokens: Sequence[str]) -> Sequence[str]:
-    """Predict the POS for a tokenized sequence"""
-    words_idx = vectoriser.encode(tokens).to(device)
-    # No gradient computation here: this is only for predictions
-    with torch.no_grad():
-        # equivalent to model(input) when called out of class
-        out = model(words_idx).to(device)
-    out_predictions = out.to(device)
-    return vectoriser.decode(out_predictions)
-
-
-if __name__ == "__main__":
-    ### NEURAL NETWORK ###
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print("Device check. You are using:", device)
-
-    ### TRAINED NETWORK ###
-    # To make sure the results are the same on every run of the notebook
-    torch.use_deterministic_algorithms(True)
-    torch.manual_seed(0)
-    random.seed(0)
-
-    # The encoder can also be trained separately
-    encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
-    decoder = Decoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
-    # Once trained, they can be saved
-    torch.save(encoder.state_dict(), "model/encoder.pt")
-    torch.save(encoder.state_dict(), "model/encoder.pt")
-
-    trained_classifier = EncoderDecoderModel(encoder, decoder, device).to(device)
-
-    print(next(trained_classifier.parameters()).device)
-    # print(train_dataset.is_cuda)
-
-    train_network(
-        trained_classifier,
-        [vectoriser.vectorize(row) for index, row in train_dataset.iterrows()],
-        [vectoriser.vectorize(row) for index, row in dev_dataset.iterrows()],
-        5,
-    )
-
-    torch.save(trained_classifier.state_dict(), "model/model.pt")
-
-    print(f'test text : {dev_dataset.iloc[6]["summary"]}')
-    print(
-        f'test prediction : {predict(trained_classifier, dev_dataset.iloc[6]["text"])}'
-    )
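The deleted script's responsibilities (seeding, building the encoder and decoder, training, saving) reappear in train.py's __main__ block below. Its CUBLAS setting does not, and torch.use_deterministic_algorithms(True) requires it for cuBLAS on CUDA 10.2+; if deterministic GPU runs are still wanted, it would have to be set before any CUDA work, for example:

# Assumption: deterministic GPU runs are still desired; CUDA 10.2+ accepts
# ":16:8" or ":4096:8" for deterministic cuBLAS behaviour.
import os

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"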
src/train.py
CHANGED

@@ -3,21 +3,19 @@ Training the network
 """
 import datetime
 import logging
+import random
 import time
 from typing import Sequence, Tuple
 
 import torch
 
 import dataloader
+from model import Decoder, Encoder, EncoderDecoderModel
 
 # logging INFO, WARNING, ERROR, CRITICAL, DEBUG
 logging.basicConfig(level=logging.INFO)
 logging.disable(level=10)
 
-data1 = dataloader.Data("data/train_extract.jsonl")
-words = data1.get_words()
-vectoriser = dataloader.Vectoriser(words)
-
 
 def train_network(
     model: torch.nn.Module,
@@ -47,7 +45,6 @@ def train_network(
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = model.to(device)
     print("Device check. You are using:", model.device)
-    model.train()
 
     # with torch.no_grad():
 
@@ -81,10 +78,12 @@ def train_network(
 
             out = model(source).to(device)
             logging.debug(f"outputs = {out.shape}")
+
             target = torch.nn.functional.pad(
                 target, (0, len(out) - len(target)), value=-100
             )
-
+
+            # logging.debug(f"prediction : {vectoriser.decode(output_predictions)}")
             loss = torch.nn.functional.nll_loss(out, target).to(device)
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
@@ -131,3 +130,72 @@ def train_network(
     print(
         f"{epoch_n}\t{epoch_loss/epoch_length:.5}\t{abs(dev_correct/dev_total):.2%}\t\t{datetime.timedelta(seconds=epoch_compute_time)}"
     )
+
+
+def predict(model, tokens: Sequence[str]) -> Sequence[str]:
+    """Predict the POS for a tokenized sequence"""
+    words_idx = vectoriser.encode(tokens).to(device)
+    # No gradient computation here: this is only for predictions
+    with torch.no_grad():
+        # equivalent to model(input) when called out of class
+        out = model(words_idx).to(device)
+    out_predictions = out.to(device)
+    print(out_predictions)
+    out_predictions = out_predictions.argmax(dim=-1)
+    return vectoriser.decode(out_predictions)
+
+
+if __name__ == "__main__":
+    train_dataset = dataloader.Data("data/train_extract.jsonl")
+    words = train_dataset.get_words()
+    vectoriser = dataloader.Vectoriser(words)
+
+    train_dataset = dataloader.Data("data/train_extract.jsonl", transform=vectoriser)
+    dev_dataset = dataloader.Data("data/dev_extract.jsonl", transform=vectoriser)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=2, shuffle=True, collate_fn=dataloader.pad_collate
+    )
+
+    dev_dataloader = torch.utils.data.DataLoader(
+        dev_dataset, batch_size=4, shuffle=True, collate_fn=dataloader.pad_collate
+    )
+
+    for i_batch, batch in enumerate(train_dataloader):
+        print(i_batch, batch[0], batch[1])
+
+    ### NEURAL NETWORK ###
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print("Device check. You are using:", device)
+
+    ### TRAINED NETWORK ###
+    # To make sure the results are the same on every run of the notebook
+    torch.use_deterministic_algorithms(True)
+    torch.manual_seed(0)
+    random.seed(0)
+
+    # The encoder can also be trained separately
+    encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
+    decoder = Decoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
+
+    trained_classifier = EncoderDecoderModel(encoder, decoder, vectoriser, device).to(
+        device
+    )
+
+    print(next(trained_classifier.parameters()).device)
+    # print(train_dataset.is_cuda)
+
+    train_network(
+        trained_classifier,
+        train_dataset,
+        dev_dataset,
+        2,
+    )
+
+    torch.save(trained_classifier.state_dict(), "model/model.pt")
+    vectoriser.save("model/vocab.pkl")
+
+    print(f"test summary : {vectoriser.decode(dev_dataset[6][1])}")
+    print(
+        f"test prediction : {predict(trained_classifier, vectoriser.decode(dev_dataset[6][0]))}"
+    )
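A detail that makes the -100 padding in pad_collate line up with the loss: torch.nn.functional.nll_loss expects log-probabilities and integer class targets, and it skips positions whose target equals ignore_index, which defaults to -100, so padded positions contribute nothing. A minimal, self-contained illustration:

# -100 is nll_loss's default ignore_index, so padded target steps are masked
import torch

out = torch.log_softmax(torch.randn(5, 10), dim=-1)  # (seq_len, vocab) log-probs
target = torch.tensor([3, 7, 1, -100, -100])  # last two positions are padding
loss = torch.nn.functional.nll_loss(out, target, ignore_index=-100)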