File size: 6,838 Bytes
ad78747
 
 
 
 
3805a61
ad78747
 
 
 
 
 
3805a61
ad78747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3805a61
ad78747
 
 
3805a61
 
ad78747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3805a61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cbd001
3805a61
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""
Training the network
"""
import datetime
import logging
import os
import random
import time
from typing import Sequence, Tuple

import torch

import dataloader
from model import Decoder, Encoder, EncoderDecoderModel

# Configure the root logger at INFO, then switch off every record at DEBUG
# severity or lower process-wide, which silences the logging.debug calls
# used in this module.
logging.basicConfig(level=logging.INFO)
logging.disable(logging.DEBUG)


def _train_one_epoch(model, train_set, optim, device, clip):
    """Run one optimisation pass over ``train_set``.

    Returns the summed per-sample loss and the summed ``source.shape[0]``.
    """
    # Train mode matters for layers that behave differently in training and
    # evaluation (e.g. dropout).
    model.train()

    epoch_loss = 0.0
    epoch_length = 0

    for source, target in train_set:
        source = source.to(device)
        target = target.to(device)

        # Gradients accumulate by default: reset them so each step only uses
        # the gradient of the current sample.
        optim.zero_grad()

        out = model(source)
        logging.debug("outputs = %s", out.shape)

        # Align the target with the output length; -100 is the index that
        # nll_loss ignores by default.
        # NOTE(review): unlike the dev pass, the first position is not
        # dropped here — confirm this asymmetry is intended.
        target = torch.nn.functional.pad(
            target, (0, len(out) - len(target)), value=-100
        )

        loss = torch.nn.functional.nll_loss(out, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

        epoch_loss += loss.item()
        epoch_length += source.shape[0]

    return epoch_loss, epoch_length


def _evaluate_accuracy(model, dev_set, device):
    """Count correctly predicted target tokens over ``dev_set``.

    Returns ``(correct, total)`` where ``total`` is the number of target
    positions that are not padding (-100).
    """
    model.eval()

    correct = 0
    total = 0

    # No gradient is needed for evaluation; skipping it saves memory and time.
    with torch.no_grad():
        for source, target in dev_set:
            source = source.to(device)
            target = target.to(device)

            output = model(source)
            output_dim = output.shape[-1]

            # Drop the first position of both sequences and flatten them.
            output = output[1:].reshape(-1, output_dim)
            logging.debug("dev output : %s", output.shape)
            target = target[1:].reshape(-1)

            # Output and target must have the same length to be compared:
            # pad the target with -100 and exclude those positions below.
            target = torch.nn.functional.pad(
                target, (0, len(output) - len(target)), value=-100
            )

            scored = target != -100
            predictions = output.argmax(dim=-1)
            correct += (predictions[scored] == target[scored]).sum().item()
            total += scored.sum().item()

    return correct, total


def train_network(
    model: torch.nn.Module,
    train_set: Sequence[Tuple[torch.Tensor, torch.Tensor]],
    dev_set: Sequence[Tuple[torch.Tensor, torch.Tensor]],
    epochs: int,
    clip: float = 1,
    lr: float = 0.01,
) -> None:
    """
    Train the network for a given number of epochs and print, per epoch,
    the training loss, the dev accuracy and the compute time.
    -----------
    Parameters
        model: torch.nn.Module
            the network to train (the EncoderDecoderModel from model.py
            when run as a script); it is moved to CUDA when available
        train_set: Sequence[Tuple[torch.Tensor, torch.Tensor]]
            (source, target) tensor pairs used for optimisation
        dev_set: Sequence[Tuple[torch.Tensor, torch.Tensor]]
            (source, target) tensor pairs used for evaluation only
        epochs: int
            the number of epochs to train on
        clip: float
            maximum gradient norm; gradients are rescaled to this norm
            before each optimiser step
        lr: float
            learning rate of the Adam optimiser
    Return
        None
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # A generic torch.nn.Module has no ``device`` attribute, so report the
    # device the model was just moved to.
    print("Device check. You are using:", device)

    optim = torch.optim.Adam(model.parameters(), lr=lr)

    print("Epoch\ttrain loss\tdev accuracy\tcompute time")

    for epoch_n in range(epochs):
        # The timing covers both the training and the evaluation pass.
        epoch_start_time = time.time()

        epoch_loss, epoch_length = _train_one_epoch(
            model, train_set, optim, device, clip
        )
        dev_correct, dev_total = _evaluate_accuracy(model, dev_set, device)

        epoch_compute_time = time.time() - epoch_start_time

        # Empty datasets yield NaN instead of a ZeroDivisionError.
        mean_loss = epoch_loss / epoch_length if epoch_length else float("nan")
        dev_accuracy = dev_correct / dev_total if dev_total else float("nan")

        print(
            f"{epoch_n}\t{mean_loss:.5}\t{dev_accuracy:.2%}\t\t{datetime.timedelta(seconds=epoch_compute_time)}"
        )


def predict(model, tokens: Sequence[str]) -> Sequence[str]:
    """Run the model on a tokenized text and return the decoded prediction.

    The tokens are encoded with the module-level ``vectoriser``, which is
    only defined when this file is run as a script; the prediction is the
    argmax over the last dimension of the model output, decoded with the
    same vectoriser.

    Parameters
        model
            the trained network; it is evaluated in eval mode and its
            previous train/eval mode is restored afterwards
        tokens: Sequence[str]
            the tokenized input text
    Return
        whatever ``vectoriser.decode`` produces for the predicted indices
        (presumably a sequence of tokens — confirm against dataloader.py)
    """
    # Put the input on the device the model weights live on, rather than
    # relying on a module-level ``device`` variable.
    first_param = next(iter(model.parameters()), None)
    model_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )
    words_idx = vectoriser.encode(tokens).to(model_device)

    # Inference must run in eval mode so that train-only layers such as
    # dropout are disabled; the caller's mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # No gradient computation here: this is inference only.
        with torch.no_grad():
            out = model(words_idx)
    finally:
        model.train(was_training)

    logging.debug("raw prediction scores: %s", out)
    out_predictions = out.argmax(dim=-1)
    return vectoriser.decode(out_predictions)


if __name__ == "__main__":
    # First pass over the training data without a transform, only to collect
    # the words the vectoriser is built from.
    train_dataset = dataloader.Data("data/train_extract.jsonl")
    words = train_dataset.get_words()
    # NOTE: ``vectoriser`` must stay a module-level name, predict() reads it.
    vectoriser = dataloader.Vectoriser(words)

    # Reload the datasets with the vectoriser applied as a transform.
    train_dataset = dataloader.Data("data/train_extract.jsonl", transform=vectoriser)
    dev_dataset = dataloader.Data("data/dev_extract.jsonl", transform=vectoriser)

    # NOTE(review): these batched loaders are only used for the debug print
    # below; train_network() is given the raw datasets, not the loaders.
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=2, shuffle=True, collate_fn=dataloader.pad_collate
    )

    dev_dataloader = torch.utils.data.DataLoader(
        dev_dataset, batch_size=4, shuffle=True, collate_fn=dataloader.pad_collate
    )

    for i_batch, batch in enumerate(train_dataloader):
        print(i_batch, batch[0], batch[1])

    ### NEURAL NETWORK ###
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device check. You are using:", device)

    ### TRAINED NETWORK ###
    # Make the results reproducible from one run to the next.
    torch.use_deterministic_algorithms(True)
    torch.manual_seed(0)
    random.seed(0)

    # The encoder could also be trained separately.
    # NOTE(review): the "+ 1" on the vocabulary size presumably reserves an
    # extra index (padding/unknown?) — confirm against dataloader.py.
    encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
    decoder = Decoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)

    trained_classifier = EncoderDecoderModel(encoder, decoder, vectoriser, device).to(
        device
    )

    print(next(trained_classifier.parameters()).device)

    train_network(
        trained_classifier,
        train_dataset,
        dev_dataset,
        2,
    )

    # Make sure the output directory exists so that a missing folder does not
    # throw away the whole training run at save time.
    os.makedirs("model", exist_ok=True)
    torch.save(trained_classifier.state_dict(), "model/model.pt")
    vectoriser.save("model/vocab.pkl")
    trained_classifier.config.to_json_file("config.json")

    # Show one dev example: sample 6 when available, otherwise the last one,
    # so that a small dev set does not raise an IndexError.
    sample_idx = min(6, len(dev_dataset) - 1)
    if sample_idx >= 0:
        sample = dev_dataset[sample_idx]
        print(f"test summary : {vectoriser.decode(sample[1])}")
        print(
            f"test prediction : {predict(trained_classifier, vectoriser.decode(sample[0]))}"
        )