Spaces:
Runtime error

Set up a working dataset class

Files changed:

- src/dataloader.py +56 -37
- src/inference.py +8 -7
- src/model.py +40 -38
- src/script.py +0 -90
- src/train.py +74 -6
src/dataloader.py
CHANGED

@@ -11,17 +11,15 @@
 Creating a Vectoriser from the vocabulary:
 
 """
+import pickle
 import string
 from collections import Counter
 
 import pandas as pd
 import torch
-from nltk import word_tokenize
 
-# nltk.download('punkt')
 
-
-class Data:
+class Data(torch.utils.data.Dataset):
     """
     A class used to get data from file
     ...
@@ -44,8 +42,27 @@ class Data:
         create a dataset with cleaned data
     """
 
-    def __init__(self, path: str) -> None:
+    def __init__(self, path: str, transform=None) -> None:
         self.path = path
+        self.data = pd.read_json(path_or_buf=self.path, lines=True)
+        self.transform = transform
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        row = self.data.iloc[idx]
+        text = row["text"].translate(str.maketrans("", "", string.punctuation)).split()
+        summary = (
+            row["summary"].translate(str.maketrans("", "", string.punctuation)).split()
+        )
+        summary = ["<start>", *summary, "<end>"]
+        sample = {"text": text, "summary": summary}
+
+        if self.transform:
+            sample = self.transform(sample)
+
+        return sample
 
     def open(self) -> pd.DataFrame:
         """
@@ -85,26 +102,13 @@ class Data:
         # - handle proper nouns (consecutive words starting with a capital letter)
         for text in texts:
             text = text.translate(str.maketrans("", "", string.punctuation))
-            text = word_tokenize(text)
+            text = text.split()
             tokenized_texts.append(text)
 
         if text_type == "summary":
             return [["<start>", *summary, "<end>"] for summary in tokenized_texts]
         return tokenized_texts
 
-    def pad_sequence(self):
-        """
-        pad summary with empty token
-        """
-        texts = self.clean_data("text")
-        summaries = self.clean_data("summary")
-        padded_summary = []
-        for text, summary in zip(texts, summaries):
-            if len(summary) != len(text):
-                summary += ["<empty>"] * (len(text) - len(summary))
-            padded_summary.append(summary)
-        return texts, padded_summary
-
     def get_words(self) -> list:
         """
         Create a dictionary of the data vocabulary
@@ -114,15 +118,20 @@ class Data:
         summary_words = [word for text in summaries for word in text]
         return text_words + summary_words
 
-
-
-
-
-
-
-
-
-
+
+def pad_collate(data):
+    text_batch = [element[0] for element in data]
+    summary_batch = [element[1] for element in data]
+    max_len = max([len(element) for element in summary_batch + text_batch])
+    text_batch = [
+        torch.nn.functional.pad(element, (0, max_len - len(element)), value=-100)
+        for element in text_batch
+    ]
+    summary_batch = [
+        torch.nn.functional.pad(element, (0, max_len - len(element)), value=-100)
+        for element in summary_batch
+    ]
+    return text_batch, summary_batch
 
 
 class Vectoriser:
@@ -146,12 +155,25 @@ class Vectoriser:
         encode an entire row from the dataset
     """
 
-    def __init__(self, vocab) -> None:
+    def __init__(self, vocab=None) -> None:
         self.vocab = vocab
         self.word_count = Counter(word.lower().strip(",.\\-") for word in self.vocab)
         self.idx_to_token = sorted([t for t, c in self.word_count.items() if c > 1])
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}
 
+    def load(self, path):
+        with open(path, "rb") as file:
+            self.vocab = pickle.load(file)
+        self.word_count = Counter(
+            word.lower().strip(",.\\-") for word in self.vocab
+        )
+        self.idx_to_token = sorted([t for t, c in self.word_count.items() if c > 1])
+        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}
+
+    def save(self, path):
+        with open(path, "wb") as file:
+            pickle.dump(self.vocab, file)
+
     def encode(self, tokens) -> torch.tensor:
         """
         Encode a sentence according to the words it contains
@@ -165,7 +187,7 @@ class Vectoriser:
         :return: words_idx : tensor
             A tensor containing the indices of the sentence's words
         """
-        if …
+        if isinstance(tokens, list):
             words_idx = torch.tensor(
                 [
                     self.token_to_idx.get(t.lower(), len(self.token_to_idx))
@@ -175,7 +197,7 @@ class Vectoriser:
             )
 
         # Allows encoding word by word
-        elif …
+        elif isinstance(tokens, str):
             words_idx = torch.tensor(self.token_to_idx.get(tokens.lower()))
 
         return words_idx
@@ -184,9 +206,9 @@ class Vectoriser:
         """
         Decode a sentence by the inverse of the encode process
         """
-
+
         idxs = words_idx_tensor.tolist()
-        if …
+        if isinstance(idxs, int):
             words = [self.idx_to_token[idxs]]
         else:
             words = []
@@ -195,10 +217,7 @@ class Vectoriser:
             words.append(self.idx_to_token[idx])
         return words
 
-    def …
-        pass
-
-    def vectorize(self, row) -> torch.tensor:
+    def __call__(self, row) -> torch.tensor:
         """
         Encode the data from one row of the dataframe
         ----------
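Taken together, this file replaces the old pad_sequence flow with a torch Dataset plus a padding collate function. A minimal usage sketch, assuming the vectoriser transform turns the {"text", "summary"} sample into an indexable pair of index tensors (pad_collate reads element[0] and element[1]):

# Hypothetical wiring, not part of the commit: build the vocabulary on a first
# pass, then re-open the data with the vectoriser as transform and batch it.
import torch

import dataloader

train_dataset = dataloader.Data("data/train_extract.jsonl")
vectoriser = dataloader.Vectoriser(train_dataset.get_words())

train_dataset = dataloader.Data("data/train_extract.jsonl", transform=vectoriser)
loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=dataloader.pad_collate
)
for text_batch, summary_batch in loader:
    # every sequence is right-padded with -100 up to the longest text or
    # summary in the batch
    break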
src/inference.py
CHANGED

@@ -1,21 +1,22 @@
 """
 Allows to predict the summary for a given entry text
 """
+import pickle
+
 import torch
-from nltk import word_tokenize
 
-
-from …
+import dataloader
+from model import Decoder, Encoder, EncoderDecoderModel
 
 # We have to load the data to get the Vectoriser > save "words" to a file and load it back later ??
 ### TO CHANGE SO THAT ONLY THE VECTORISER HAS TO BE LOADED
 data1 = dataloader.Data("data/train_extract.jsonl")
 data2 = dataloader.Data("data/dev_extract.jsonl")
-
-dev_dataset = data2.make_dataset()
+words = pickle.load("model/vocab.pkl")
 words = data1.get_words()
 
-vectoriser = dataloader.Vectoriser(words)
+vectoriser = dataloader.Vectoriser()
+vectoriser.load("model/vocab.pkl")
 word_counts = vectoriser.word_count
 
 
@@ -30,7 +31,7 @@ def inferenceAPI(text: str) -> str:
     str
         The summary for the input text
     """
-    text = word_tokenize(text)
+    text = text.split()
     # Define the model's input parameters
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device).to(
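One caveat in the rewritten module: pickle.load takes an open file object, not a path string, so the added words = pickle.load("model/vocab.pkl") line would raise a TypeError if reached. The Vectoriser.load method added in dataloader.py already wraps this correctly; a sketch of the working form:

# pickle.load needs a binary file handle, as in Vectoriser.load
import pickle

with open("model/vocab.pkl", "rb") as file:
    words = pickle.load(file)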
src/model.py
CHANGED

@@ -6,14 +6,8 @@ import logging
 
 import torch
 
-from src import dataloader
-
 logging.basicConfig(level=logging.DEBUG)
 
-data1 = dataloader.Data("data_extract/train_extract.jsonl")
-words = data1.get_words()
-vectoriser = dataloader.Vectoriser(words)
-
 
 class Encoder(torch.nn.Module):
     def __init__(
@@ -86,51 +80,59 @@
 
 
 class EncoderDecoderModel(torch.nn.Module):
-    def __init__(self, encoder, decoder, device):
+    def __init__(self, encoder, decoder, vectoriser, device):
         # A torch idiosyncrasy, so it can work its magic
         super().__init__()
         self.encoder = encoder
         self.decoder = decoder
+        self.vectoriser = vectoriser
         self.device = device
 
-    def forward(self, source, num_beams=3):
-        …
+    def forward(self, source, num_beams=3, summary_len=0.2):
+        """
+        :param source: tensor
+            the input text
+        :param num_beams: int
+            the number of outputs to iterate on for beam_search
+        :param summary_len: int
+            length ratio of the summary compared to the text
+        """
+        # The ratio must be less than 1 to allow text compression
+        assert summary_len < 1, f"number less than 1 expected, got {summary_len}"
+
+        target_len = int(
+            summary_len * source.shape[0]
+        )  # Expected summary length (in words)
+        target_vocab_size = self.decoder.vocab_size  # Word embedding length
+
+        # Output of the right shape (expected summary length x word embedding length)
+        # filled with zeros. On each iteration, we will replace one of the rows of
+        # this matrix with the chosen word embedding
+        outputs = torch.zeros(target_len, target_vocab_size)
+
+        # put the tensors on the device (useless on CPU but very useful in case of GPU)
+        outputs.to(self.device)
+        source.to(self.device)
 
         # last hidden state of the encoder is used as the initial hidden state of the decoder
-        …
-        )
-        cell.to(
-        …
-        # first input to the decoder is the <start> token.
-        input = vectoriser.encode("<start>")  # The model's starting word
-        input.to(self.device)  # torch idiosyncrasy to put it on the GPU
-
-        ### START OF THE TEST INSTANTIATION ###
+        hidden, cell = self.encoder(source)  # Encode the input text
+        input = self.vectoriser.encode(
+            "<start>"
+        )  # Encode the first word of the summary
+
+        # put the tensors on the device
+        hidden.to(self.device)
+        cell.to(self.device)
+        input.to(self.device)
+
+        ### BEAM SEARCH ###
         # If you wonder, b stands for better
         values = None
         b_outputs = torch.zeros(target_len, target_vocab_size).to(self.device)
         b_outputs.to(self.device)
 
-        for i in range(
-            …
-        ):  # Determine as many words as the desired text length
+        for i in range(1, target_len):
+            # Determine as many words as the desired text length
             # insert input token embedding, previous hidden and previous cell states
             # receive output tensor (predictions) and new hidden and cell states.
 
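A caveat on the new forward body: torch.Tensor.to() is not in-place; it returns a new tensor, so bare calls such as outputs.to(self.device) leave the original tensor where it was. A sketch of the assignment form that actually moves the tensors, reusing the names from the method above:

# Tensor.to() returns a copy on the target device; assign it back to take effect
outputs = outputs.to(self.device)
source = source.to(self.device)
hidden = hidden.to(self.device)
cell = cell.to(self.device)
input = input.to(self.device)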
src/script.py
DELETED

@@ -1,90 +0,0 @@
-"""
-DONE :
-    - Separate the vectoriser part from the classifier
-    - Add an LSTM to the classifier
-    - train the classifier
-TO DO :
-    - Improve the model's results
-"""
-import logging
-import random
-from typing import Sequence
-
-import torch
-
-import dataloader
-from model import Decoder, Encoder, EncoderDecoderModel
-from train import train_network
-
-# logging INFO, WARNING, ERROR, CRITICAL, DEBUG
-logging.basicConfig(level=logging.INFO)
-logging.disable(level=10)
-
-import os
-
-os.environ[
-    "CUBLAS_WORKSPACE_CONFIG"
-] = ":16:8"  # so it runs deterministically on my work machine
-# environment variable in git bash: export CUBLAS_WORKSPACE_CONFIG=:16:8
-# from datasets import load_dataset
-
-### OPEN DATASET ###
-# dataset = load_dataset("newsroom", data_dir=DATA_PATH, data_files="data/train.jsonl")
-
-data1 = dataloader.Data("data/train_extract.jsonl")
-data2 = dataloader.Data("data/dev_extract.jsonl")
-train_dataset = data1.make_dataset()
-dev_dataset = data2.make_dataset()
-words = data1.get_words()
-
-vectoriser = dataloader.Vectoriser(words)
-word_counts = vectoriser.word_count
-
-
-def predict(model, tokens: Sequence[str]) -> Sequence[str]:
-    """Predict the POS for a tokenized sequence"""
-    words_idx = vectoriser.encode(tokens).to(device)
-    # No gradient computation here: this is only for predictions
-    with torch.no_grad():
-        # equivalent to model(input) when called out of class
-        out = model(words_idx).to(device)
-    out_predictions = out.to(device)
-    return vectoriser.decode(out_predictions)
-
-
-if __name__ == "__main__":
-    ### NEURAL NETWORK ###
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print("Device check. You are using:", device)
-
-    ### TRAINED NETWORK ###
-    # To make sure the results are the same on every run of the notebook
-    torch.use_deterministic_algorithms(True)
-    torch.manual_seed(0)
-    random.seed(0)
-
-    # The encoder can also be trained separately
-    encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
-    decoder = Decoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
-    # Once trained, they can be saved
-    torch.save(encoder.state_dict(), "model/encoder.pt")
-    torch.save(encoder.state_dict(), "model/encoder.pt")
-
-    trained_classifier = EncoderDecoderModel(encoder, decoder, device).to(device)
-
-    print(next(trained_classifier.parameters()).device)
-    # print(train_dataset.is_cuda)
-
-    train_network(
-        trained_classifier,
-        [vectoriser.vectorize(row) for index, row in train_dataset.iterrows()],
-        [vectoriser.vectorize(row) for index, row in dev_dataset.iterrows()],
-        5,
-    )
-
-    torch.save(trained_classifier.state_dict(), "model/model.pt")
-
-    print(f'test text : {dev_dataset.iloc[6]["summary"]}')
-    print(
-        f'test prediction : {predict(trained_classifier, dev_dataset.iloc[6]["text"])}'
-    )
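The deleted script's responsibilities (seeding, building the encoder and decoder, training, saving) reappear in train.py's __main__ block below. Its CUBLAS setting does not, and torch.use_deterministic_algorithms(True) requires it for cuBLAS on CUDA 10.2+; if deterministic GPU runs are still wanted, it would have to be set before any CUDA work, for example:

# Assumption: deterministic GPU runs are still desired; CUDA 10.2+ accepts
# ":16:8" or ":4096:8" for deterministic cuBLAS behaviour.
import os

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"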
src/train.py
CHANGED

@@ -3,21 +3,19 @@ Training the network
 """
 import datetime
 import logging
+import random
 import time
 from typing import Sequence, Tuple
 
 import torch
 
 import dataloader
+from model import Decoder, Encoder, EncoderDecoderModel
 
 # logging INFO, WARNING, ERROR, CRITICAL, DEBUG
 logging.basicConfig(level=logging.INFO)
 logging.disable(level=10)
 
-data1 = dataloader.Data("data/train_extract.jsonl")
-words = data1.get_words()
-vectoriser = dataloader.Vectoriser(words)
-
 
 def train_network(
     model: torch.nn.Module,
@@ -47,7 +45,6 @@ def train_network(
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = model.to(device)
     print("Device check. You are using:", model.device)
-    model.train()
 
     # with torch.no_grad():
 
@@ -81,10 +78,12 @@ def train_network(
 
             out = model(source).to(device)
             logging.debug(f"outputs = {out.shape}")
+
             target = torch.nn.functional.pad(
                 target, (0, len(out) - len(target)), value=-100
             )
-
+
+            # logging.debug(f"prediction : {vectoriser.decode(output_predictions)}")
             loss = torch.nn.functional.nll_loss(out, target).to(device)
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
@@ -131,3 +130,72 @@ def train_network(
     print(
         f"{epoch_n}\t{epoch_loss/epoch_length:.5}\t{abs(dev_correct/dev_total):.2%}\t\t{datetime.timedelta(seconds=epoch_compute_time)}"
     )
+
+
+def predict(model, tokens: Sequence[str]) -> Sequence[str]:
+    """Predict the POS for a tokenized sequence"""
+    words_idx = vectoriser.encode(tokens).to(device)
+    # No gradient computation here: this is only for predictions
+    with torch.no_grad():
+        # equivalent to model(input) when called out of class
+        out = model(words_idx).to(device)
+    out_predictions = out.to(device)
+    print(out_predictions)
+    out_predictions = out_predictions.argmax(dim=-1)
+    return vectoriser.decode(out_predictions)
+
+
+if __name__ == "__main__":
+    train_dataset = dataloader.Data("data/train_extract.jsonl")
+    words = train_dataset.get_words()
+    vectoriser = dataloader.Vectoriser(words)
+
+    train_dataset = dataloader.Data("data/train_extract.jsonl", transform=vectoriser)
+    dev_dataset = dataloader.Data("data/dev_extract.jsonl", transform=vectoriser)
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=2, shuffle=True, collate_fn=dataloader.pad_collate
+    )
+
+    dev_dataloader = torch.utils.data.DataLoader(
+        dev_dataset, batch_size=4, shuffle=True, collate_fn=dataloader.pad_collate
+    )
+
+    for i_batch, batch in enumerate(train_dataloader):
+        print(i_batch, batch[0], batch[1])
+
+    ### NEURAL NETWORK ###
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print("Device check. You are using:", device)
+
+    ### TRAINED NETWORK ###
+    # To make sure the results are the same on every run of the notebook
+    torch.use_deterministic_algorithms(True)
+    torch.manual_seed(0)
+    random.seed(0)
+
+    # The encoder can also be trained separately
+    encoder = Encoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
+    decoder = Decoder(len(vectoriser.idx_to_token) + 1, 256, 512, 0.5, device)
+
+    trained_classifier = EncoderDecoderModel(encoder, decoder, vectoriser, device).to(
+        device
+    )
+
+    print(next(trained_classifier.parameters()).device)
+    # print(train_dataset.is_cuda)
+
+    train_network(
+        trained_classifier,
+        train_dataset,
+        dev_dataset,
+        2,
+    )
+
+    torch.save(trained_classifier.state_dict(), "model/model.pt")
+    vectoriser.save("model/vocab.pkl")
+
+    print(f"test summary : {vectoriser.decode(dev_dataset[6][1])}")
+    print(
+        f"test prediction : {predict(trained_classifier, vectoriser.decode(dev_dataset[6][0]))}"
+    )
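A detail that makes the -100 padding in pad_collate line up with the loss: torch.nn.functional.nll_loss expects log-probabilities and integer class targets, and it skips positions whose target equals ignore_index, which defaults to -100, so padded positions contribute nothing. A minimal, self-contained illustration:

# -100 is nll_loss's default ignore_index, so padded target steps are masked
import torch

out = torch.log_softmax(torch.randn(5, 10), dim=-1)  # (seq_len, vocab) log-probs
target = torch.tensor([3, 7, 1, -100, -100])  # last two positions are padding
loss = torch.nn.functional.nll_loss(out, target, ignore_index=-100)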