# global
import string
from typing import List, Tuple
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.utils import resample
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import RegexpTokenizer
import tensorflow as tf
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
# local
from utils import Doc2VecModel
nltk.download('stopwords')
nltk.download('punkt')
punct = string.punctuation
stemmer = nltk.stem.PorterStemmer()
eng_stopwords = nltk.corpus.stopwords.words("english")
class Preprocessor:
"""Responsible for preprocessing case facts."""
def __init__(self) -> None:
pass
def _nltk_tokenizer(self, text: str) -> List[str]:
"""
Tokenize a given `text` using the RegexpTokenizer from the nltk library.
Parameters:
-----------
- text : str
A string containing the text to be tokenized.
Returns:
--------
- tokens : List[str]
A list of tokens generated by the tokenizer.
"""
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(text)
return tokens
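# Illustrative sketch of the tokenizer (hypothetical input): RegexpTokenizer(r"\w+")
# keeps runs of word characters and drops punctuation, e.g.
#   >>> Preprocessor()._nltk_tokenizer("The court ruled in 1984.")
#   ['The', 'court', 'ruled', 'in', '1984']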
def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
"""Splitting `text_column` into tokens.
Parameters:
------------
- text_column : pd.Series
Contains text that needs to be tokenized.
Returns:
--------
- tokenized_text : pd.Series
Contains tokenized version of `text_column`.
"""
tokenized_text = text_column.apply(self._nltk_tokenizer)
return tokenized_text
def _convert_to_tagged_document(
self, text_column: pd.Series
) -> Tuple[List[List[str]], List[TaggedDocument]]:
"""
Convert the tokenized `text_column` to TaggedDocuments.
Parameters:
------------
- text_column : pd.Series
Contains the list of tokens of each fact.
Returns:
--------
A tuple containing the following items:
- tokens_list : List[List[str]]
Contains all tokens of each case in the `text_column`.
- tagged_docs : list[TaggedDocument]
Contains TaggedDocument object for each case.
"""
tokens_list = text_column.to_list()
tagged_docs = [TaggedDocument(t, [str(i)])
for i, t in enumerate(tokens_list)]
return tokens_list, tagged_docs
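# Sketch of the tagging step (hypothetical tokens, for illustration only): each
# case's token list is wrapped in a gensim TaggedDocument tagged with its row
# position, e.g. ["john", "sued", "smith"] at position 0 becomes
# TaggedDocument(["john", "sued", "smith"], ["0"]).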
def _vectorize_text(
self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[List[str]]
) -> pd.DataFrame:
"""
Convert each token list in `tokens_list` to a vector.
Parameters:
-----------
- doc2vec_model : Doc2Vec
Trained Doc2Vec model.
- df : pd.Series
Used only to provide the indices for the newly generated dataframe.
- tokens_list : List[List[str]]
Contains all tokens of each case.
Returns:
--------
- text_vectors_df : pd.DataFrame
Contains the vector representation for each case.
"""
text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list]
text_vectors_df = pd.DataFrame(text_vectors, index=df.index)
return text_vectors_df
def _anonymize_case_facts(
self, first_party_name: str, second_party_name: str, facts: str
) -> str:
"""
Anonymize case facts by replacing party names with the "_PARTY_" tag.
Parameters:
------------
- first_party_name : str
Represents first party name or petitioner name.
- second_party_name : str
Represents second party name or respondent name.
- facts : str
Represents case facts.
Returns:
--------
- anonymized_facts : str
An anonymized version of `facts`.
"""
# remove any commas and any non-alphabetic characters
first_party_name = re.sub(r"[\,+]", " ", first_party_name)
first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name)
second_party_name = re.sub(r"[\,+]", " ", second_party_name)
second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name)
for name in first_party_name.split():
facts = re.sub(name, " _PARTY_ ", facts)
for name in second_party_name.split():
facts = re.sub(name, " _PARTY_ ", facts)
# replace any consecutive _PARTY_ tags with only one _PARTY_ tag.
regex_consecutive_tags = r"(_PARTY_\s+){2,}"
anonymized_facts = re.sub(regex_consecutive_tags, " _PARTY_ ", facts)
# collapse any consecutive spaces
anonymized_facts = re.sub(r"\s+", " ", anonymized_facts)
return anonymized_facts
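# Rough sketch of the anonymization on hypothetical inputs:
#   _anonymize_case_facts("John Miller", "Smith", "John Miller sued Smith.")
# replaces each party-name token and collapses repeated tags, yielding roughly
#   " _PARTY_ sued _PARTY_ ."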
def _preprocess_text(self, text: str) -> str:
"""
Preprocess & clean `text`, including:
- lowercasing
- removing quotation marks
- removing digits
- removing punctuation
- removing brackets, braces, and parentheses
- removing stopwords
- stemming tokens
Parameters:
------------
- text : str
Text to be processed (cleaned).
Returns:
--------
- processed_text : str
A preprocessed version of `text`.
"""
text = text.lower()
# remove quotation marks
text = re.sub(r"\'", "", text)
# remove digits
text = re.sub(r"\d+", "", text)
# remove punctuation while keeping the '_' character
text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)])
# remove brackets, braces, and parentheses
text = re.sub(r"[\[\]\(\)\{\}]+", " ", text)
tokens = nltk.word_tokenize(text)
# remove stopwords and stem tokens
tokens = [stemmer.stem(token)
for token in tokens if token not in eng_stopwords]
# convert tokens back to string
processed_text = " ".join(tokens)
return processed_text
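# Rough sketch of the cleaning on a hypothetical sentence:
#   "The parties' dispute (filed in 1998) was dismissed."
# is lowercased, stripped of digits/punctuation/brackets, stopword-filtered and
# Porter-stemmed, yielding approximately "parti disput file dismiss".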
def convert_text_to_vectors_doc2vec(
self,
text_column: pd.Series,
train: bool = True,
embeddings_doc2vec: Doc2Vec = None,
) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
"""
Convert `text_column` to vectors using a `Doc2Vec` model.
Parameters:
------------
- text_column : pd.Series
Contains the case facts.
- train : bool, optional
Defines whether the model will be trained. If True, a new Doc2Vec model is
trained; otherwise the passed `embeddings_doc2vec` is used. (Default is True).
- embeddings_doc2vec : Doc2Vec, optional
Trained Doc2Vec model will be used for generating embeddings of `text_column` if
`train` is False. (Default is None).
Returns:
--------
1. A tuple containing the following (if `train` is True):
- embeddings_doc2vec : Doc2Vec
Trained Doc2Vec model.
- text_vectors_df : pd.DataFrame
A DataFrame containing `text_column` vectors.
2. text_vectors_df : pd.DataFrame
A DataFrame containing `text_column` vectors (if `train` is False).
Raises:
-------
- AssertionError
If train is False and `embeddings_doc2vec` is None.
- AssertionError
If train is False and `embeddings_doc2vec` is not an instance of Doc2Vec.
"""
tokenized_text = self._tokenize_text(text_column)
tokens_list, tagged_docs = self._convert_to_tagged_document(
tokenized_text)
if train:
doc2vec_model = Doc2VecModel()
embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model(
tagged_docs
)
text_vectors_df = self._vectorize_text(
embeddings_doc2vec, text_column, tokens_list
)
return embeddings_doc2vec, text_vectors_df
assert (
embeddings_doc2vec is not None
), "`embeddings_doc2vec` argument must not be None."
assert isinstance(
embeddings_doc2vec, Doc2Vec
), "`embeddings_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
text_vectors_df = self._vectorize_text(
embeddings_doc2vec, text_column, tokens_list
)
return text_vectors_df
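# Minimal usage sketch (assumes `train_facts` / `test_facts` are hypothetical
# pd.Series of case facts; Doc2VecModel comes from the local utils module):
#   pre = Preprocessor()
#   d2v, train_vecs = pre.convert_text_to_vectors_doc2vec(train_facts)
#   test_vecs = pre.convert_text_to_vectors_doc2vec(
#       test_facts, train=False, embeddings_doc2vec=d2v)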
def convert_text_to_vectors_tf_idf(
self,
text_column: pd.Series,
ngrams: int = 2,
max_tokens: int = 10000,
output_mode: str = "tf-idf",
train: bool = True,
text_vectorizer: TextVectorization = None,
) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
"""
Convert `text_column` to vectors using a `TextVectorization` layer.
Parameters:
------------
- text_column : pd.Series
Contains the case facts.
- ngrams : int, optional
Defines the n-gram size to create (Default is 2).
- max_tokens : int, optional
Defines the maximum vocabulary size of `text_vectorizer` (Default is 10,000).
- output_mode : str, optional
Represents the output vector type: "tf-idf", "binary", or "count"
(Default is "tf-idf").
- train : bool, optional
Defines whether the vectorizer will be trained. If True, a new TextVectorization
layer is adapted; otherwise the passed `text_vectorizer` is used.
(Default is True).
- text_vectorizer : TextVectorization, optional
Trained TextVectorization layer will be used for generating embeddings of
`text_column` if `train` is False. (Default is None).
Returns:
--------
- if `train` == True:
A tuple containing the following:
- text_vectorizer : TextVectorization
Trained TextVectorization layer.
- text_vectors : tf.Tensor
A Tensor containing `text_column` training vectors.
- otherwise:
text_vectors : tf.Tensor
A Tensor containing `text_column` testing vectors.
Raises:
-------
- AssertionError
If train is False and `text_vectorizer` is None.
- AssertionError
If train is False and `text_vectorizer` is not an instance of TextVectorization.
"""
if train:
text_vectorizer = TextVectorization(
ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
)
text_vectorizer.adapt(text_column)
text_vectors = text_vectorizer(text_column)
return text_vectorizer, text_vectors
assert (
text_vectorizer is not None
), "`text_vectorizer` argument must not be None."
assert isinstance(
text_vectorizer, TextVectorization
), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
text_vectors = text_vectorizer(text_column)
return text_vectors
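# Minimal usage sketch (hypothetical train/test Series, reusing the `pre`
# instance from the Doc2Vec sketch above): adapt the TF-IDF vectorizer on the
# training split, then reuse it for the test split.
#   vec, train_tfidf = pre.convert_text_to_vectors_tf_idf(train_facts)
#   test_tfidf = pre.convert_text_to_vectors_tf_idf(
#       test_facts, train=False, text_vectorizer=vec)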
def convert_text_to_vectors_cnn(
self,
text_column: pd.Series,
max_tokens: int = 2000,
output_sequence_length: int = 500,
output_mode: str = "int",
train: bool = True,
text_vectorizer: TextVectorization = None,
) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
"""
Convert `text_column` to vectors using a `TextVectorization` layer.
Parameters:
------------
- text_column : pd.Series
Contains the case facts.
- max_tokens : int, optional
Defines the maximum vocabulary size of `text_vectorizer` (Default is 2000).
- output_sequence_length : int, optional
Represents the dimensions of the output vector (Default is 500).
- output_mode : str, optional
Represents the output vector type: "int", "binary", or "tf-idf" (Default is "int").
- train : bool, optional
Defines whether the vectorizer will be trained. If True, a new
TextVectorization layer is adapted; otherwise the passed
`text_vectorizer` is used. (Default is True).
- text_vectorizer : TextVectorization, optional
Trained TextVectorization layer will be used for generating embeddings of
`text_column` if `train` is False. (Default is None).
Returns:
--------
- if `train` == True:
A tuple containing the following:
- text_vectorizer : TextVectorization
Trained TextVectorization layer.
- text_vectors : tf.Tensor
A Tensor containing `text_column` training vectors.
- otherwise:
text_vectors : tf.Tensor
A Tensor containing `text_column` testing vectors.
Raises:
-------
- AssertionError
If train is False and `text_vectorizer` is None.
- AssertionError
If train is False and `text_vectorizer` is not an instance of TextVectorization.
"""
if train:
text_vectorizer = TextVectorization(
max_tokens=max_tokens,
output_mode=output_mode,
output_sequence_length=output_sequence_length,
)
text_vectorizer.adapt(text_column)
text_vectors = text_vectorizer(text_column)
return text_vectorizer, text_vectors
assert (
text_vectorizer is not None
), "`text_vectorizer` argument must not be None."
assert isinstance(
text_vectorizer, TextVectorization
), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
text_vectors = text_vectorizer(text_column)
return text_vectors
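# Minimal usage sketch (hypothetical train/test Series, reusing the `pre`
# instance sketched above): integer-encode the facts and pad/truncate them to
# `output_sequence_length` tokens for a CNN input layer.
#   vec, train_ints = pre.convert_text_to_vectors_cnn(train_facts)
#   test_ints = pre.convert_text_to_vectors_cnn(
#       test_facts, train=False, text_vectorizer=vec)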
def convert_text_to_vectors_glove(
self,
text_column: pd.Series,
train: bool = True,
glove_tokenizer: Tokenizer = None,
vocab_size: int = 1000,
oov_token: str = "<OOV>",
max_length: int = 50,
padding_type: str = "post",
truncation_type: str = "post",
) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
"""
Convert `text_column` to vectors using `glove_tokenizer`.
Parameters:
------------
- text_column : pd.Series
Contains the case facts.
- train : bool, optional
Defines whether the tokenizer will be trained. If True, a new
Tokenizer is fitted; otherwise the passed
`glove_tokenizer` is used. (Default is True).
- glove_tokenizer : Tokenizer, optional
Trained Tokenizer that will be used for generating sequences of
`text_column` if `train` is False. (Default is None).
- vocab_size : int, optional
Represents the vocabulary size of the Tokenizer; any token not in this
vocabulary will be treated as an out-of-vocabulary (OOV)
token. (Default is 1000).
- oov_token : str, optional
Represents the placeholder used for out-of-vocabulary tokens (Default is "<OOV>").
- max_length : int, optional
Defines the output vector's dimension. (Default is 50).
- padding_type : str, optional
Defines the padding type of the vectors, if the vector size is less than
`max_length`, the rest of the `max_length` will be padded with 0 (Default is "post").
- truncation_type : str, optional
Defines the truncation type of the vectors, if the vector size is more than
`max_length`, the extra of the `max_length` will be truncated (Default is "post").
Returns:
--------
- if `train` == True:
A tuple containing the following:
- glove_tokenizer : Tokenizer
Trained Tokenizer.
- text_padded : np.ndarray
An array containing `text_column` vectors.
- otherwise:
text_padded : np.ndarray
An array containing `text_column` vectors.
Raises:
-------
- AssertionError
If train is False and `glove_tokenizer` is None.
- AssertionError
If train is False and `glove_tokenizer` is not an instance of Tokenizer.
"""
if train:
glove_tokenizer = Tokenizer(
num_words=vocab_size, oov_token=oov_token)
glove_tokenizer.fit_on_texts(text_column)
text_sequences = glove_tokenizer.texts_to_sequences(text_column)
text_padded = pad_sequences(
text_sequences,
maxlen=max_length,
padding=padding_type,
truncating=truncation_type,
)
return glove_tokenizer, text_padded
assert (
glove_tokenizer is not None
), "`glove_tokenizer` argument must not be None."
assert isinstance(
glove_tokenizer, Tokenizer
), "`glove_tokenizer` argument must be an instance of Tokenizer."
text_sequences = glove_tokenizer.texts_to_sequences(text_column)
text_padded = pad_sequences(
text_sequences,
maxlen=max_length,
padding=padding_type,
truncating=truncation_type,
)
return text_padded
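# Minimal usage sketch (hypothetical train/test Series, reusing the `pre`
# instance sketched above): the fitted Tokenizer's word_index is what a GloVe
# embedding matrix would later be built from.
#   tok, train_padded = pre.convert_text_to_vectors_glove(train_facts)
#   test_padded = pre.convert_text_to_vectors_glove(
#       test_facts, train=False, glove_tokenizer=tok)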
def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
"""
Balance `X_train` and `y_train` so that the targets in `y_train` are equally distributed.
Parameters:
------------
- X_train : pd.Series
Contains the training case facts.
- y_train : pd.Series
Contains the training targets.
Returns:
--------
- shuffled_balanced_df : pd.DataFrame
Contains the new balanced dataframe with shuffled indices.
"""
df = pd.concat([X_train, y_train], axis=1)
first_party = df[df["winner_index"] == 0]
second_party = df[df["winner_index"] == 1]
upsample_second_party = resample(
second_party, replace=True, n_samples=len(first_party), random_state=42
)
upsample_df = pd.concat([upsample_second_party, first_party])
shuffled_indices = np.arange(upsample_df.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]
return shuffled_balanced_df
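# Note: resample() upsamples the winner_index == 1 rows to match the count of
# winner_index == 0 rows, i.e. it assumes the second party is the minority
# class in the training split. Usage sketch (hypothetical X_train / y_train):
#   balanced_df = pre.balance_data(X_train, y_train)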
def anonymize_data(
self,
first_party_names: pd.Series,
second_party_names: pd.Series,
text_column: pd.Series,
) -> pd.Series:
"""
Anonymize `text_column` by replacing `first_party_names` and
`second_party_names` with the "_PARTY_" tag.
Parameters:
------------
- first_party_names : pd.Series
Contains all first party names that need to be anonymized.
- second_party_names : pd.Series
Contains all second party names that need to be anonymized.
- text_column : pd.Series
Contains all texts that need to be anonymized.
Returns:
--------
- all_anonymized_facts : pd.Series
Contains anonymized version of `text_column`.
"""
all_anonymized_facts = []
for i in range(text_column.shape[0]):
facts = text_column.iloc[i]
first_party_name = first_party_names.iloc[i]
second_party_name = second_party_names.iloc[i]
anonymized_facts = self._anonymize_case_facts(
first_party_name, second_party_name, facts
)
all_anonymized_facts.append(anonymized_facts)
return pd.Series(all_anonymized_facts)
def preprocess_data(self, text_column: pd.Series) -> pd.Series:
"""
Preprocess & clean all texts in `text_column`.
Parameters:
------------
- text_column : pd.Series
Contains all case facts.
Returns:
--------
- preprocessed_text : pd.Series
Contains all texts after being processed.
"""
preprocessed_text = text_column.apply(self._preprocess_text)
return preprocessed_text
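# A minimal end-to-end sketch of the intended call order. The CSV path and the
# column names ("first_party", "second_party", "facts") are illustrative
# assumptions, not guaranteed by this module.
if __name__ == "__main__":
    cases_df = pd.read_csv("cases.csv")
    preprocessor = Preprocessor()
    # replace party names with the _PARTY_ tag, then clean and stem the facts
    anonymized_facts = preprocessor.anonymize_data(
        cases_df["first_party"], cases_df["second_party"], cases_df["facts"]
    )
    cleaned_facts = preprocessor.preprocess_data(anonymized_facts)
    # train Doc2Vec embeddings on the cleaned facts
    doc2vec_model, facts_vectors_df = preprocessor.convert_text_to_vectors_doc2vec(
        cleaned_facts
    )
    print(facts_vectors_df.shape)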