Spaces:

EveSa
/

SummaryProject

Runtime error

File size: 7,407 Bytes

"""
    Get data and adapt it for training
    -----------
    - clean up encoding errors
    - add <START> and <END> tokens
    TO DO :
    - clean up contractions
    - remove the \xad characters (soft hyphens)
    - remove punctuation and () []
    - handle proper nouns (consecutive words starting with a capital letter)
    Creation of a Vectoriser from the vocabulary:

"""
import pickle
import string
from collections import Counter

import pandas as pd
import torch


class Data(torch.utils.data.Dataset):
    """
    Dataset of text/summary pairs read from a JSON Lines file.

    The file is parsed once in ``__init__`` with pandas; every line must be
    a JSON object providing at least the ``"text"`` and ``"summary"`` fields
    read by the methods below.

    Attributes
    ----------
    path : str
        the path to the file containing the data
    data : pandas.DataFrame
        the parsed content of the file, loaded once at construction
    transform : callable or None
        optional callable applied to each sample returned by ``__getitem__``

    Methods
    -------
    open()
        read the jsonl file again with pandas and return a fresh DataFrame
    clean_data(text_type)
        clean one column of the loaded data, tokenise it and add <start>
        and <end> tokens when text_type is "summary"
    get_words()
        get every word of the dataset (texts then summaries)
    """

    # Translation table deleting every ASCII punctuation character. Built once
    # here instead of once per tokenised string.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, path: str, transform=None) -> None:
        self.path = path
        self.data = pd.read_json(path_or_buf=self.path, lines=True)
        self.transform = transform

    def __len__(self):
        """Return the number of text/summary pairs."""
        return len(self.data)

    def _tokenize(self, text: str) -> list:
        """Remove ASCII punctuation from ``text`` and split it on whitespace."""
        return text.translate(self._PUNCTUATION_TABLE).split()

    def __getitem__(self, idx):
        """
        Return the sample stored at position ``idx``.

        The sample is a dict with the tokenised ``"text"`` and the tokenised
        ``"summary"`` wrapped in ``<start>``/``<end>`` tokens. When a
        ``transform`` was given, its result is returned instead of the dict.
        """
        row = self.data.iloc[idx]
        sample = {
            "text": self._tokenize(row["text"]),
            # The markers are added after punctuation removal, otherwise
            # their angle brackets would be stripped as well.
            "summary": ["<start>", *self._tokenize(row["summary"]), "<end>"],
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

    def open(self) -> pd.DataFrame:
        """
        Read the file at ``self.path`` again and return it as a new DataFrame.

        The other methods use the copy loaded in ``__init__`` (``self.data``);
        this method is only needed to pick up the file from disk afresh.
        """
        return pd.read_json(path_or_buf=self.path, lines=True)

    def clean_data(self, text_type: str) -> list:
        """
        Clean one column of the dataset and tokenise it.

        Works on the DataFrame loaded in ``__init__``, so the file is not
        parsed again on every call.

        Still to do: contractions, ``\\xad`` characters and proper nouns are
        not handled yet.

        Parameters
        ----------
        text_type : str
            name of the column to clean ('text' or 'summary'); for
            'summary' the <start> and <end> tokens are added

        Returns
        ----------
        list of list
            list of tokenised texts

        Raises
        ----------
        KeyError
            if ``text_type`` is not a column of the data
        """
        texts = self.data[text_type]
        # Round trip cp1252 -> UTF-8, presumably to repair UTF-8 text that was
        # decoded as cp1252 upstream. Characters or bytes that cannot be
        # converted are silently dropped by the "ignore" error handler.
        texts = texts.str.encode("cp1252", "ignore")
        texts = texts.str.decode("utf-8", "ignore")

        tokenized_texts = [self._tokenize(text) for text in texts]

        if text_type == "summary":
            return [["<start>", *summary, "<end>"]
                    for summary in tokenized_texts]
        return tokenized_texts

    def get_words(self) -> list:
        """
        Return every word of the dataset as one flat list.

        Words of the texts come first, followed by the words of the summaries
        (including their <start>/<end> tokens). Duplicates are kept.
        """
        texts, summaries = self.clean_data("text"), self.clean_data("summary")
        text_words = [word for text in texts for word in text]
        summary_words = [word for text in summaries for word in text]
        return text_words + summary_words


def pad_collate(data):
    """
    Pad every sequence of a batch to one common length.

    Parameters
    ----------
    data : sequence
        batch whose elements hold the text at index 0 and the summary at
        index 1 (assumed to be 1-D tensors — TODO confirm against the caller)

    Returns
    ----------
    tuple of two lists
        the padded texts and the padded summaries; all of them have the
        length of the longest sequence found among texts AND summaries
    """
    texts, summaries = [], []
    for pair in data:
        texts.append(pair[0])
        summaries.append(pair[1])

    # One shared target length for both sides of the batch.
    longest = max(len(sequence) for sequence in summaries + texts)

    def right_pad(sequence):
        # Fill on the right only. NOTE(review): -100 presumably matches the
        # default ignore_index of torch.nn.CrossEntropyLoss — confirm.
        missing = longest - len(sequence)
        return torch.nn.functional.pad(sequence, (0, missing), value=-100)

    padded_texts = [right_pad(sequence) for sequence in texts]
    padded_summaries = [right_pad(sequence) for sequence in summaries]
    return padded_texts, padded_summaries


class Vectoriser:
    """
    A class used to vectorise data
    ...

    Attributes
    ----------
    vocab : list
        list of all the words of the corpus (duplicates included)
    word_count : collections.Counter
        number of occurrences of every normalised word of ``vocab``
    idx_to_token : list
        sorted list of the normalised words occurring more than once
    token_to_idx : dict
        reverse mapping of ``idx_to_token``

    Methods
    -------
    load(path)
        replace the vocabulary by the one pickled at ``path``
    save(path)
        pickle the vocabulary to ``path``
    encode(tokens)
        transforms a list of tokens (or a single token) to their
        corresponding idx in form of torch tensor
    decode(word_idx_tensor)
        converts a tensor to a list of tokens
    __call__(row)
        encode the text and the summary of one sample of the dataset
    """

    # Characters stripped from both ends of a word when counting the vocabulary.
    _STRIP_CHARS = ",.\\-"

    def __init__(self, vocab=None) -> None:
        # A missing vocabulary is treated as empty, so that ``Vectoriser()``
        # followed by ``load(path)`` works instead of failing on ``None``.
        self.vocab = [] if vocab is None else vocab
        self._build_index()

    def _build_index(self) -> None:
        """Rebuild the word counts and both lookup tables from ``self.vocab``."""
        self.word_count = Counter(
            word.lower().strip(self._STRIP_CHARS) for word in self.vocab
        )
        # Words seen only once are left out of the index.
        self.idx_to_token = sorted(
            token for token, count in self.word_count.items() if count > 1
        )
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)
        }

    def load(self, path):
        """Load a pickled vocabulary from ``path`` and rebuild the index."""
        with open(path, "rb") as file:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load vocabulary files coming from a trusted source.
            self.vocab = pickle.load(file)
        self._build_index()

    def save(self, path):
        """Pickle the raw vocabulary (not the index) to ``path``."""
        with open(path, "wb") as file:
            pickle.dump(self.vocab, file)

    def encode(self, tokens) -> torch.Tensor:
        """
        Encode a sentence, or a single word, with the indexes of the
        vocabulary.

        A word that is not in the vocabulary is mapped to one fixed index
        (the size of the vocabulary), which ``decode`` ignores.
        ---------
        :params: tokens : list, tuple or str
            the words of the sentence, or one single word
        :return: words_idx : tensor
            a 1-D long tensor with the index of every word for a sequence,
            a 0-d long tensor for a single word
        :raises: TypeError
            if ``tokens`` is neither a str nor a list/tuple
        """
        # NOTE(review): the lookup only lowercases the token, while the index
        # is built from words that are also stripped of ",.\\-" — confirm
        # whether tokens should go through the same stripping.
        unknown_idx = len(self.token_to_idx)

        # Allows encoding word by word
        if isinstance(tokens, str):
            return torch.tensor(
                self.token_to_idx.get(tokens.lower(), unknown_idx),
                dtype=torch.long,
            )

        if isinstance(tokens, (list, tuple)):
            return torch.tensor(
                [self.token_to_idx.get(t.lower(), unknown_idx) for t in tokens],
                dtype=torch.long,
            )

        raise TypeError(
            "tokens must be a str or a list of str, got "
            + type(tokens).__name__
        )

    def decode(self, words_idx_tensor) -> list:
        """
        Decode a tensor of indexes back to words (inverse of ``encode``).

        Every index that does not point into the vocabulary is skipped: the
        unknown-word index produced by ``encode`` as well as negative values
        such as the -100 padding added by ``pad_collate``.
        """
        idxs = words_idx_tensor.tolist()
        if isinstance(idxs, int):
            # 0-d tensor: a single word
            idxs = [idxs]

        vocab_size = len(self.idx_to_token)
        return [self.idx_to_token[idx] for idx in idxs if 0 <= idx < vocab_size]

    def __call__(self, row) -> tuple:
        """
        Encode one sample of the dataset
        ----------
        :params: row : mapping
            one text/summary pair, with the tokens under the "text" and
            "summary" keys
        :returns: text_idx : tensor
            the tensor corresponding to the words of the text
        :returns: summary_idx: tensor
            the tensor corresponding to the words of the summary
        """
        text_idx = self.encode(row["text"])
        summary_idx = self.encode(row["summary"])
        return (text_idx, summary_idx)