Update PrateritumGPT.py
PrateritumGPT.py  CHANGED  +77 -18
@@ -4,6 +4,15 @@ import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
 from torch.nn.utils.rnn import pad_sequence
 import math
+import progressbar
+
+device="cpu"
+
+def CreateBar():
+    global bar
+    bar = progressbar.ProgressBar(maxval=100, \
+                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
+    bar.start()
 
 tokens = list("azertyuiopqsdfghjklmwxcvbnäüöß—– ")
 tokensdict = {}
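For reference, the progress-bar calls introduced here use the classic `progressbar` API (`maxval`, `widgets`, `start`/`update`/`finish`). Below is a minimal self-contained sketch of the same pattern, assuming the `progressbar` (or `progressbar2`) package is installed; the `time.sleep` call is only a stand-in for real work.

import time
import progressbar

# Mirror CreateBar(): a bar that runs from 0 to maxval=100.
bar = progressbar.ProgressBar(maxval=100,
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
for step in range(100):
    time.sleep(0.01)      # placeholder for one unit of work
    bar.update(step + 1)  # current progress, must not exceed maxval
bar.finish()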
@@ -31,17 +40,18 @@ class CSVDataset(Dataset):
 # Suppose you have your data in the form of lists
 features = []
 labels = []
+padding=len(tokens)
 
 for i in reader:
     k = []
     for j in i[2]:
-        k += [tokens.index(j)
-        k += [
+        k += [tokens.index(j)]
+        #k += [-1] * (25 - len(k))
     features += [torch.Tensor(k)]
     k = []
     for j in i[8]:
-        k += [tokens.index(j)
-        k += [
+        k += [tokens.index(j)]
+        #k += [-1] * (25 - len(k))
     labels += [torch.Tensor(k)]
 
 MyDataset = CSVDataset(features=features, labels=labels)
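The new `padding = len(tokens)` picks the index just past the last real token as the pad id; the collate function later passes it to `pad_sequence`, and the embedding layer receives it as `padding_idx`. A small sketch of that interaction with a toy alphabet (the three-character vocabulary below is made up for illustration):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

vocab = list("ab ")           # toy alphabet
pad_id = len(vocab)           # 3: one index past the real tokens

a = torch.tensor([0, 1, 2])   # "ab "
b = torch.tensor([1, 1])      # "bb"
batch = pad_sequence([a, b], batch_first=True, padding_value=pad_id)
# batch -> [[0, 1, 2],
#           [1, 1, 3]]        # the shorter sequence is padded with pad_id

emb = nn.Embedding(len(vocab) + 1, 4, padding_idx=pad_id)
out = emb(batch)
# The embedding row for pad_id starts at zero and is never updated by gradients,
# so padded positions map to a constant zero vector.
print(out[1, 2])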
@@ -49,17 +59,18 @@ MyDataset = CSVDataset(features=features, labels=labels)
 class TransformerModel(nn.Module):
     def __init__(self, vocab_size, emb_dim, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout=0.1):
         super().__init__()
-        self.custom_embedding = nn.Embedding(vocab_size, emb_dim).to(
-        self.pos_encoder = PositionalEncoding(emb_dim, dropout).to(
-        encoder_layer = nn.TransformerEncoderLayer(emb_dim, nhead, dim_feedforward, dropout, batch_first=True).to(
+        self.custom_embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding).to(device)
+        self.pos_encoder = PositionalEncoding(emb_dim, dropout).to(device)
+        encoder_layer = nn.TransformerEncoderLayer(emb_dim, nhead, dim_feedforward, dropout, batch_first=True).to(device)
         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)
-        decoder_layer = nn.TransformerDecoderLayer(emb_dim, nhead, dim_feedforward, dropout, batch_first=True).to(
+        decoder_layer = nn.TransformerDecoderLayer(emb_dim, nhead, dim_feedforward, dropout, batch_first=True).to(device)
         self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers)
-        self.output_layer = nn.Linear(emb_dim, vocab_size).to(
+        self.output_layer = nn.Linear(emb_dim, vocab_size).to(device)
 
     def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
+        #print("Source:", src)
+        #print("Target:", tgt)
         src_emb = self.custom_embedding(src.long())
-        #print("Source Embedding:", src_emb.shape)
         src_emb = self.pos_encoder(src_emb)
         #print("Source Embedding:", src_emb.shape)
         tgt_emb = self.custom_embedding(tgt.long())
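Each submodule above is moved with its own `.to(device)` call. Since `TransformerModel` is an `nn.Module`, an equivalent style is to register the layers normally and move the whole model once after construction; a sketch with illustrative sizes (the class and field names here are hypothetical, not part of the script):

import torch.nn as nn

device = "cpu"

class TinyModel(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        # No per-layer .to(device); submodules are registered as usual.
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.out = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        return self.out(self.emb(x))

tiny = TinyModel(vocab_size=34, emb_dim=16).to(device)  # moves every parameter at once
print(next(tiny.parameters()).device)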
@@ -69,6 +80,7 @@ class TransformerModel(nn.Module):
         encoder_output = self.transformer_encoder(src_emb, src_mask, src_key_padding_mask)
         decoder_output = self.transformer_decoder(tgt_emb, encoder_output, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask)
         output = self.output_layer(decoder_output[:, -1, :])
+        #print("Output:",output.shape)
         return output
 
 class PositionalEncoding(nn.Module):
@@ -89,24 +101,63 @@ class PositionalEncoding(nn.Module):
         return self.dropout(x)
 
 def collate_fn(batch):
-    inputs = [item[0].to(
-    targets = [item[1].to(
-    inputs = pad_sequence(inputs, batch_first=True, padding_value=
-    targets = pad_sequence(targets, batch_first=True, padding_value=
+    inputs = [item[0].to(device) for item in batch]
+    targets = [item[1].to(device) for item in batch]
+    inputs = pad_sequence(inputs, batch_first=True, padding_value=padding)
+    targets = pad_sequence(targets, batch_first=True, padding_value=padding)
     return inputs, targets
 
 train_loader = DataLoader(MyDataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
 
-model = TransformerModel(vocab_size=len(tokens)+1, emb_dim=
+model = TransformerModel(vocab_size=len(tokens)+1, emb_dim=16, nhead=4, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=256)
 loss_fn = nn.CrossEntropyLoss()
 optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
 
-epochs =
+epochs = 100
+
+try:
+    model.load_state_dict(torch.load("data/PrateritumGPT.pth"))
+    print("Successfully loaded model.")
+except:
+    pass
+
+#print(model(torch.zeros((1,25)).to(device),torch.zeros((1,25)).to(device)))
+inp=input("Which verb? ")
+src=[[]]
+tgt=[[tokens.index(inp[0])]]
+for i in inp:
+    src[0]+=[tokens.index(i)]
+str_=inp[0]
+for i in range(100):
+    out=model(torch.Tensor(src).to(device),torch.Tensor(tgt).to(device)).tolist()[0]
+    Best=0
+    Best_=tokens.index(" ")
+    for k,f in enumerate(out):
+        if f>Best:
+            Best=f
+            Best_=k
+    if Best_==len(tokens):
+        break
+    str_+=tokens[Best_]
+    tgt[0]+=[Best_]
+
+print(str_)
+
 
 for epoch in range(epochs):
     total_loss = 0.0
 
+    CreateBar()
+
+    bar.start()
+
     for batch_idx, (inputs, targets) in enumerate(train_loader):
+
+        #print("",inputs,targets)
+
+        targets.to(device)
+        inputs.to(device)
+
         for i in range(1, targets.shape[1]):
             optimizer.zero_grad()
             output = model(inputs, targets[:, :i]) # Shifted targets
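The generation loop added above scans the output scores by hand with `Best`/`Best_`; `torch.argmax` performs the same greedy pick whenever at least one score is positive (the hand-rolled version falls back to the space token if every score is non-positive, because `Best` starts at 0). A minimal sketch of the argmax form, with made-up scores:

import torch

def pick_next(scores):
    # scores: vocabulary scores for the next token of one sequence
    return int(torch.argmax(torch.as_tensor(scores)))

print(pick_next([0.1, 0.3, 0.9, 0.2]))  # -> 2, the highest-scoring index

Separately, inside the batch loop, `targets.to(device)` and `inputs.to(device)` return new tensors rather than converting in place, so as written they have no effect; the usual pattern is to reassign, for example `targets = targets.to(device)`. With `device = "cpu"` the tensors are already on the right device, so nothing breaks here.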
@@ -117,8 +168,16 @@ for epoch in range(epochs):
 
             total_loss += loss.item()
 
-
+            mask = targets[:, i] != len(tokens)
+            targets = targets[mask]
+            inputs = inputs[mask]
+
+        bar.update((batch_idx+1)/len(train_loader)*100)
+
+        #print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {total_loss / (batch_idx + 1)}")
+
+    bar.finish()
 
     print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")
 
-torch.save(model, "data/PrateritumGPT.pth")
+torch.save(model.state_dict(), "data/PrateritumGPT.pth")
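Two notes on this last hunk. Saving `model.state_dict()` instead of the whole model object matches the `model.load_state_dict(torch.load(...))` call added earlier, which expects a state dict. And the new `mask` lines drop sequences whose next target is the pad id, so padding never reaches the loss; an alternative sketch that keeps the batch intact and lets `nn.CrossEntropyLoss` skip padded positions via `ignore_index` (the pad id mirrors `padding = len(tokens)` in the script, and the logits here are made up):

import torch
import torch.nn as nn

pad_id = 33                                # mirrors padding = len(tokens) for the token list above
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)

# Made-up logits for three next-token predictions over a 34-symbol vocabulary.
logits = torch.randn(3, 34)
targets = torch.tensor([5, pad_id, 12])    # the middle target is padding
loss = loss_fn(logits, targets)            # positions equal to ignore_index contribute nothing
print(loss)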