""" | |
Get data and adapt it for training | |
----------- | |
- nettoyage de l'encodage | |
- Ajout de token <START> et <END> | |
TO DO : | |
- Nettoyage des contractions | |
- enlever les \xad | |
- enlever ponctuation et () [] | |
- s'occuper des noms propres (mots commençant par une majuscule qui se suivent) | |
Création d'un Vectoriserà partir du vocabulaire : | |
""" | |
import string
from collections import Counter

import pandas as pd
import torch
from nltk import word_tokenize

# nltk.download('punkt')


class Data:
    """
    A class used to get data from a file

    ...

    Attributes
    ----------
    path : str
        the path to the file containing the data

    Methods
    -------
    open()
        open the jsonl file with pandas
    clean_data(text_type)
        clean the data obtained by opening the file and add <start> and
        <end> tokens depending on the text_type
    pad_sequence()
        pad each summary with <empty> tokens up to the length of its text
    get_words()
        get the dataset vocabulary
    make_dataset()
        create a dataset with cleaned data
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def open(self) -> pd.DataFrame:
        """
        Open the jsonl file containing the data
        """
        return pd.read_json(path_or_buf=self.path, lines=True)

    def clean_data(self, text_type: str) -> list:
        """
        Clean the data of encoding errors, punctuation, etc.

        To do:
        - clean up contractions
        - remove the \xad characters
        - handle proper nouns (consecutive words starting with a capital letter)

        Parameters
        ----------
        text_type : str
            distinguishes between 'text' and 'summary',
            so that <start> and <end> tokens are only added to summaries

        Returns
        -------
        list of list
            list of tokenised texts
        """
        dataset = self.open()
        texts = dataset[text_type]
        # Encode to cp1252 and decode back as utf-8, silently dropping any
        # characters that do not survive; this strips most encoding artefacts.
        texts = texts.str.encode("cp1252", "ignore")
        texts = texts.str.decode("utf-8", "ignore")
        tokenized_texts = []
        for text in texts:
            # Remove punctuation (including () and []), then tokenise.
            text = text.translate(str.maketrans("", "", string.punctuation))
            text = word_tokenize(text)
            tokenized_texts.append(text)
        # Summaries are wrapped with <start> and <end> tokens.
        if text_type == "summary":
            return [["<start>", *summary, "<end>"] for summary in tokenized_texts]
        return tokenized_texts

    def pad_sequence(self):
        """
        Pad each summary with <empty> tokens up to the length of its text
        """
        texts = self.clean_data("text")
        summaries = self.clean_data("summary")
        padded_summary = []
        for text, summary in zip(texts, summaries):
            if len(summary) != len(text):
                summary += ["<empty>"] * (len(text) - len(summary))
            padded_summary.append(summary)
        return texts, padded_summary

    def get_words(self) -> list:
        """
        Return the full list of tokens in the dataset (texts and summaries),
        which serves as the vocabulary
        """
        texts, summaries = self.clean_data("text"), self.clean_data("summary")
        text_words = [word for text in texts for word in text]
        summary_words = [word for summary in summaries for word in summary]
        return text_words + summary_words

    def make_dataset(self) -> pd.DataFrame:
        """
        Create a pandas DataFrame with the cleaned data
        --------------------
        :param self: Data
        :return: pd.DataFrame
        """
        texts, summaries = self.clean_data("text"), self.clean_data("summary")
        return pd.DataFrame(list(zip(texts, summaries)), columns=["text", "summary"])
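
# A minimal usage sketch for Data (assumption: "data.jsonl" is a hypothetical
# path to a JSONL file with one {"text": ..., "summary": ...} object per line):
#
#   data = Data("data.jsonl")
#   df = data.make_dataset()   # DataFrame of tokenised 'text'/'summary' pairs
#   vocab = data.get_words()   # flat token list, used to build a Vectoriser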


class Vectoriser:
    """
    A class used to vectorise data

    ...

    Attributes
    ----------
    vocab : list
        list of the vocabulary words

    Methods
    -------
    encode(tokens)
        transforms a list of tokens into their corresponding indices,
        in the form of a torch tensor
    decode(word_idx_tensor)
        converts a tensor back into a list of tokens
    vectorize(row)
        encodes an entire row of the dataset
    """

    def __init__(self, vocab) -> None:
        self.vocab = vocab
        # Lower-case the words and strip surrounding punctuation, then keep
        # only the tokens that occur more than once in the vocabulary.
        self.word_count = Counter(word.lower().strip(",.\\-") for word in self.vocab)
        self.idx_to_token = sorted(t for t, c in self.word_count.items() if c > 1)
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}

    def encode(self, tokens) -> torch.Tensor:
        """
        Encode a sentence: map each of its words to the corresponding
        index in the vocabulary.

        NOTE:
        If a word is not in the vocabulary, it is mapped to a fixed
        out-of-vocabulary index, which is ignored when decoding.
        ---------
        :param tokens: list
            the words of the sentence, as a list
        :return: words_idx: tensor
            a tensor containing the indices of the sentence's words
        """
        if isinstance(tokens, list):
            words_idx = torch.tensor(
                [
                    self.token_to_idx.get(t.lower(), len(self.token_to_idx))
                    for t in tokens
                ],
                dtype=torch.long,
            )
        # Also allow encoding a single word passed as a string.
        elif isinstance(tokens, str):
            words_idx = torch.tensor(
                self.token_to_idx.get(tokens.lower(), len(self.token_to_idx))
            )
        return words_idx

    def decode(self, words_idx_tensor) -> list:
        """
        Decode a sentence: the inverse of encode().
        The input is expected to be a distribution over the vocabulary,
        so the most likely index is taken at every position.
        """
        words_idx_tensor = words_idx_tensor.argmax(dim=-1)
        idxs = words_idx_tensor.tolist()
        if isinstance(idxs, int):
            words = [self.idx_to_token[idxs]]
        else:
            words = []
            for idx in idxs:
                # The out-of-vocabulary index is skipped.
                if idx != len(self.idx_to_token):
                    words.append(self.idx_to_token[idx])
        return words

    def beam_search(self, words_idx_tensor) -> list:
        pass

    def vectorize(self, row) -> tuple:
        """
        Encode the data of one dataframe row.
        ----------
        :param row: dataframe row
            one row of the dataframe (a text-summary pair)
        :return: text_idx: tensor
            the tensor corresponding to the words of the text
        :return: summary_idx: tensor
            the tensor corresponding to the words of the summary
        """
        text_idx = self.encode(row["text"])
        summary_idx = self.encode(row["summary"])
        return (text_idx, summary_idx)
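

# A minimal end-to-end sketch of how the two classes fit together
# (assumption: "data.jsonl" is a hypothetical path; the actual file name
# depends on the project setup).
if __name__ == "__main__":
    data = Data("data.jsonl")
    vectoriser = Vectoriser(data.get_words())

    dataset = data.make_dataset()
    first_row = dataset.iloc[0]
    text_idx, summary_idx = vectoriser.vectorize(first_row)
    print(text_idx.shape, summary_idx.shape)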