r"""
Get data and adapt it for training.

- clean up the encoding
- add <START> and <END> tokens
- clean up contractions
- remove \xad (soft hyphens)
- remove punctuation and () []
- handle proper nouns (consecutive words starting with a capital letter)

Create a Vectoriser from the vocabulary.
"""
import string
from collections import Counter
import pandas as pd
import torch
from nltk import word_tokenize
class Data:
    """
    A class used to get data from a JSON-lines file and prepare it for training.

    Attributes
    ----------
    path : str
        the path to the file containing the data

    Methods
    -------
    open()
        open the jsonl file with pandas
    clean_data(text_type)
        clean one column of the file and add <start> and <end> tokens
        when text_type is "summary"
    pad_sequence()
        pad every summary with "<empty>" tokens up to the length of its text
    get_words()
        get every token of the dataset (texts, then summaries)
    make_dataset()
        create a dataset with cleaned data
    """

    # Translation table deleting every ASCII punctuation character; built once
    # because it does not depend on the text being cleaned.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, path: str) -> None:
        """
        Parameters
        ----------
        path : str
            the path to the jsonl file containing the data
        """
        self.path = path

    def open(self) -> pd.DataFrame:
        """
        Open the file containing the data.

        Returns
        -------
        pd.DataFrame
            one row per JSON line of the file
        """
        return pd.read_json(path_or_buf=self.path, lines=True)

    def _clean_column(self, dataset: pd.DataFrame, text_type: str) -> list:
        """Tokenise the `text_type` column of an already loaded dataset."""
        texts = dataset[text_type]
        # Round trip cp1252 -> utf-8, silently dropping whatever cannot be
        # converted. NOTE(review): presumably repairs UTF-8 text that was
        # wrongly decoded as cp1252 upstream -- confirm against the data.
        texts = texts.str.encode("cp1252", "ignore")
        texts = texts.str.decode("utf-8", "ignore")

        tokenized_texts = [
            word_tokenize(text.translate(self._PUNCTUATION_TABLE)) for text in texts
        ]

        if text_type == "summary":
            # The markers are added after punctuation removal, otherwise their
            # angle brackets would be stripped as well.
            return [["<start>", *summary, "<end>"] for summary in tokenized_texts]
        return tokenized_texts

    def _clean_pairs(self) -> tuple:
        """Read the file once and return (cleaned texts, cleaned summaries)."""
        dataset = self.open()
        return (
            self._clean_column(dataset, "text"),
            self._clean_column(dataset, "summary"),
        )

    def clean_data(self, text_type: str) -> list:
        """
        Clean data from encoding errors and punctuation, then tokenise it.

        Parameters
        ----------
        text_type : str
            name of the column to clean; used to differentiate between
            'text' and 'summary' so that <start> and <end> tokens are
            added to summaries only

        Returns
        -------
        list of list
            list of tokenised texts
        """
        return self._clean_column(self.open(), text_type)

    def pad_sequence(self):
        """
        Pad summaries with the "<empty>" token.

        Each summary shorter than its text is extended to the length of
        that text; a summary that is already as long or longer is left
        unchanged.

        Returns
        -------
        tuple
            (tokenised texts, padded tokenised summaries)
        """
        texts, summaries = self._clean_pairs()
        padded_summary = [
            # A negative multiplier yields an empty list, so longer summaries
            # are not truncated.
            summary + ["<empty>"] * (len(text) - len(summary))
            for text, summary in zip(texts, summaries)
        ]
        return texts, padded_summary

    def get_words(self) -> list:
        """
        Collect every token of the dataset.

        Returns
        -------
        list
            all text tokens followed by all summary tokens, repetitions
            included
        """
        texts, summaries = self._clean_pairs()
        text_words = [word for text in texts for word in text]
        summary_words = [word for summary in summaries for word in summary]
        return text_words + summary_words

    def make_dataset(self) -> pd.DataFrame:
        """
        Create a pandas DataFrame with cleaned data.

        Returns
        -------
        pd.DataFrame
            columns "text" and "summary", each cell holding a list of tokens
        """
        texts, summaries = self._clean_pairs()
        return pd.DataFrame(list(zip(texts, summaries)), columns=["text", "summary"])
class Vectoriser:
    """
    A class used to vectorise data.

    Attributes
    ----------
    vocab : list
        list of the vocab (every token occurrence, repetitions included)
    word_count : Counter
        number of occurrences of each normalised token
    idx_to_token : list
        sorted normalised tokens that occur more than once
    token_to_idx : dict
        reverse mapping of idx_to_token

    Methods
    -------
    encode(tokens)
        transforms a list of tokens to their corresponding idx
        in the form of a torch tensor
    decode(words_idx_tensor)
        converts a tensor to a list of tokens
    vectorize(row)
        encode an entire row from the dataset
    """

    # Characters stripped from both ends of a token before it is counted or
    # looked up.
    _STRIP_CHARS = ",.\\-"

    def __init__(self, vocab) -> None:
        """
        Parameters
        ----------
        vocab : list
            every token of the dataset; tokens seen only once are left out
            of the vocabulary
        """
        self.vocab = vocab
        self.word_count = Counter(self._normalise(word) for word in self.vocab)
        self.idx_to_token = sorted([t for t, c in self.word_count.items() if c > 1])
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}

    @classmethod
    def _normalise(cls, word: str) -> str:
        """Lower-case a token and strip the surrounding `_STRIP_CHARS`."""
        return word.lower().strip(cls._STRIP_CHARS)

    def encode(self, tokens) -> torch.Tensor:
        """
        Encode a sentence according to the words of the vocabulary.

        A word that is not in the vocabulary gets the fixed index
        len(token_to_idx), which decode() ignores.

        Parameters
        ----------
        tokens : list or str
            the words of the sentence as a list (any iterable of str is
            accepted), or a single word as a str

        Returns
        -------
        torch.Tensor
            integer tensor with one index per word; a 0-dim tensor when a
            single str is given
        """
        unknown_idx = len(self.token_to_idx)

        # Allows encoding word by word.
        if isinstance(tokens, str):
            return torch.tensor(
                self.token_to_idx.get(self._normalise(tokens), unknown_idx),
                dtype=torch.long,
            )

        # The explicit dtype keeps an empty sentence an integer tensor.
        return torch.tensor(
            [self.token_to_idx.get(self._normalise(t), unknown_idx) for t in tokens],
            dtype=torch.long,
        )

    def decode(self, words_idx_tensor) -> list:
        """
        Decode a sentence with the inverse process of encode().

        Parameters
        ----------
        words_idx_tensor : torch.Tensor
            scores whose last dimension is reduced with argmax to obtain the
            word indices. NOTE(review): only inputs whose argmax is a scalar
            or a flat sequence are handled -- confirm callers never pass a
            batch.

        Returns
        -------
        list
            the decoded words; the unknown-word index is skipped
        """
        words_idx_tensor = words_idx_tensor.argmax(dim=-1)
        idxs = words_idx_tensor.tolist()
        if isinstance(idxs, int):
            # A 0-dim tensor gives a bare int: treat it as a one-word sentence.
            idxs = [idxs]

        unknown_idx = len(self.idx_to_token)
        return [self.idx_to_token[idx] for idx in idxs if idx != unknown_idx]

    def vectorize(self, row) -> tuple:
        """
        Encode the data of one row of the dataframe.

        Parameters
        ----------
        row : mapping
            one row of the dataframe (a text-summary pair) exposing the
            "text" and "summary" keys

        Returns
        -------
        tuple
            (text_idx, summary_idx): the tensor of the words of the text and
            the tensor of the words of the summary
        """
        text_idx = self.encode(row["text"])
        summary_idx = self.encode(row["summary"])
        return (text_idx, summary_idx)