# global import string from typing import List, Tuple import numpy as np import pandas as pd import re import nltk from sklearn.utils import resample from gensim.models.doc2vec import Doc2Vec, TaggedDocument from nltk.tokenize import RegexpTokenizer import tensorflow as tf from keras.layers import TextVectorization from keras.preprocessing.text import Tokenizer from keras.utils import pad_sequences # local from utils import Doc2VecModel nltk.download('stopwords') punct = string.punctuation stemmer = nltk.stem.PorterStemmer() eng_stopwords = nltk.corpus.stopwords.words("english") class Preprocessor: """Responsible for preprocessing case facts.""" def __init__(self) -> None: pass def _nltk_tokenizer(self, text: str) -> List[str]: """ Tokenize a given `text` using the RegexpTokenizer from the nltk library. Parameters: ----------- - text : str A string containing the text to be tokenized. Returns: -------- - tokens : List[str] A list of tokens generated by the tokenizer. """ tokenizer = RegexpTokenizer(r"\w+") tokens = tokenizer.tokenize(text) return tokens def _tokenize_text(self, text_column: pd.Series) -> pd.Series: """Splitting `text_column` into tokens. Parameters: ------------ - text_column : pd.Series Contains text that needs to be tokenized. Returns: -------- - tokenized_text : pd.Series Contains tokenized version of `text_column`. """ tokenized_text = text_column.apply(self._nltk_tokenizer) return tokenized_text def _convert_to_tagged_document( self, text_column: pd.Series ) -> Tuple[List[str], List[TaggedDocument]]: """ Convert `text_column` of specific to TaggedDocuments. Parameters: ------------ - column : pd.Series Contains the list of tokens of each fact. Returns: -------- A tuble containing the following items: - tokens_list : list[str] Contains all tokens of each case in the `text_column`. - tagged_docs : list[TaggedDocument] Contains TaggedDocument object for each case. """ tokens_list = text_column.to_list() tagged_docs = [TaggedDocument(t, [str(i)]) for i, t in enumerate(tokens_list)] return tokens_list, tagged_docs def _vectorize_text( self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[str] ) -> pd.DataFrame: """ Convert values of `tokens_list` to a vector. Parameters: ----------- - doc2vec_model : Doc2Vev Trained Doc2Vec model. - df : pd.Series This will use only to get its indicies for the new generated dataframe. - tokens_list : List[str] Contains all tokens of each case. Returns: -------- - text_vectors_df : pd.DataFrame Contains the vector representaion for each case. """ text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list] text_vectors_df = pd.DataFrame(text_vectors, index=df.index) return text_vectors_df def _anonymize_case_facts( self, first_party_name: str, second_party_name: str, facts: str ) -> str: """ Anonymize case facts by replacing its party names with "_PARTY_" tag. Parameters: ------------ - first_party_name : str Represents first party name or petitioner name. - second_party_name : str Represents second party name or respondent name. - facts : str Represents case facts. Returns: -------- - anonymized_facts : str An anonymized version of `facts`. """ # remove any commas and any non alphabet characters first_party_name = re.sub(r"[\,+]", " ", first_party_name) first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name) second_party_name = re.sub(r"[\,+]", " ", second_party_name) second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name) for name in first_party_name.split(): facts = re.sub(name, " _PARTY_ ", facts) for name in second_party_name.split(): facts = re.sub(name, " _PARTY_ ", facts) # replace any consecutive _PARTY_ tags with only one _PARTY_ tag. regex_continous_tags = r"(_PARTY_\s+){2,}" anonymized_facts = re.sub(regex_continous_tags, " _PARTY_ ", facts) # remove ant consecutive spaces anonymized_facts = re.sub(r"\s+", " ", anonymized_facts) return anonymized_facts def _preprocess_text(self, text: str) -> str: """ Preprocessing & cleaning `text` including: - lowercasing - removing quotation marks - removing digits - removing punctuation - removing brackets, braces, and paranthesis - removeing stopwords - stemming tokens Parameters: ------------ - text : str Text need to be processed (cleaned). Returns: -------- - processed_text : str A preprocessed version of `text`. """ text = text.lower() # remove quotation marks text = re.sub(r"\'", "", text) # remove digits text = re.sub(r"\d+", "", text) # remove punctuation but with keeping '_' letter text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)]) # remove brackets, braces, and parantheses text = re.sub(r"[\[\]\(\)\{\}]+", " ", text) tokens = nltk.word_tokenize(text) # remove stopwords and stemming tokens tokens = [stemmer.stem(token) for token in tokens if token not in eng_stopwords] # convert tokens back to string processed_text = " ".join(tokens) return processed_text def convert_text_to_vectors_doc2vec( self, text_column: pd.Series, train: bool = True, embeddings_doc2vec: Doc2Vec = None, ) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame: """ Converting `text_column` to vectors using `Doc2Vec` model Parameters: ------------ - text_column : pd.Series Contains the case facts. - train : bool, optional Defines whether the model will be trained or not. (if True, Doc2Vec will be trained | else, Doc2Vec will used the passed `embeddings_Doc2Vec`). (Default is True). - embeddings_doc2vec : Doc2Vec, optional Trained Doc2Vec model will be used for generating embeddings of `text_column` if `train` is False. (Default is None). Returns: -------- 1. A tuple contains the following: - embeddings_doc2vec : Doc2Vec Trained Doc2Vec model. - text_vectors_df : pd.DataFrame A DataFrame contains `text_column` vectors if `train` is True. 2. text_vectors_df : pd.DataFrame A DataFrame contains `text_column` vectors if `train` is False. Raises: ------- - AssertionError If train is False and `embeddings_doc2vec` is None. - AssertionError If train is False and `embedding_doc2vec` is not an instance of Doc2Vec """ tokenized_text = self._tokenize_text(text_column) tokens_list, tagged_docs = self._convert_to_tagged_document( tokenized_text) if train: doc2vec_model = Doc2VecModel() embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model( tagged_docs ) text_vectors_df = self._vectorize_text( embeddings_doc2vec, text_column, tokens_list ) return embeddings_doc2vec, text_vectors_df assert ( embeddings_doc2vec is not None ), "`embedding_doc2vec` argument must be not None." assert isinstance( embeddings_doc2vec, Doc2Vec ), "`embedding_doc2vec` argument must be an instance of Doc2Vec to infer vectors." text_vectors_df = self._vectorize_text( embeddings_doc2vec, text_column, tokens_list ) return text_vectors_df def convert_text_to_vectors_tf_idf( self, text_column: pd.Series, ngrams: int = 2, max_tokens: int = 10000, output_mode: str = "tf-idf", train: bool = True, text_vectorizer: TextVectorization = None, ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor: """ Converting `text_column` to vectors using `TextVectorization` layer. Parameters: ------------ - text_column : pd.Series Contains the case facts. - ngrams : int, optional Defines the number of n-gram (Default is 2). - max_tokens : int, optional Defines the number of max_tokens of `text_vectorizer` (Default is 10,000). - output_mode : str, optional Represents the output vectors type whether it is "tfi-df" or "binary" or "count" (Default is "tf-idf"). - train : bool, optional Defines whether the model will be trained or not. (if True, TextVectorization will be trained, else, TextVectorization will used the passed `text_vectorizer`). (Default is True). - text_vectorizer : TextVectorization, optional Trained TextVectorization layer will be used for generating embeddings of `text_column` if `train` is False. (Default is None). Returns: -------- - if `train` == True: A tuple contains the following: - text_vectorizer : TextVectorization Trained TextVectorization layer. - text_vectors : tf.Tensor A Tensor contains `text_column` training vectors. - otherwise: text_vectors : tf.Tensor A Tensor contains `text_column` testing vectors. Raises: ------- - AssertionError If train is False and `text_vectorizer` is None. - AssertionError If train is False and `text_vectorizer` is not an instance of TextVectorization. """ if train: text_vectorizer = TextVectorization( ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode ) text_vectorizer.adapt(text_column) text_vectors = text_vectorizer(text_column) return text_vectorizer, text_vectors assert ( text_vectorizer is not None ), "`text_vectorizer` argument must be not None." assert isinstance( text_vectorizer, TextVectorization ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors." text_vectors = text_vectorizer(text_column) return text_vectors def convert_text_to_vectors_cnn( self, text_column: pd.Series, max_tokens: int = 2000, output_sequence_length: int = 500, output_mode: str = "int", train: bool = True, text_vectorizer: TextVectorization = None, ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor: """ Converting `text_column` to vectors using `TextVectorization` layer. Parameters: ------------ - text_column : pd.Series Contains the case facts. - max_tokens : int, optional Defines the number of max_tokens of `text_vectorizer` (Default is 2000). - output_sequence_length : int, optional Represents the dimensions of the output vector (Default is 500). - output_mode : str, optional Represents the output vectors type whether it is "int" or "binary" or "tfi-df". - train : bool, optional Defines whether the model will be trained or not. (if True, TextVectorization will be trained | else, TextVectorization will used the passed `text_vectorizer`). (Default is True). - text_vectorizer : TextVectorization, optional Trained TextVectorization layer will be used for generating embeddings of `text_column` if `train` is False. (Default is None). Returns: -------- - if `train` == True: A tuple contains the following: - text_vectorizer : TextVectorization Trained TextVectorization layer. - text_vectors : tf.Tensor A Tensor contains `text_column` training vectors. - otherwise: text_vectors : tf.Tensor A Tensor contains `text_column` testing vectors. Raises: ------- - AssertionError If train is False and `text_vectorizer` is None. - AssertionError If train is False and `text_vectorizer` is not an instance of TextVectorization. """ if train: text_vectorizer = TextVectorization( max_tokens=max_tokens, output_mode=output_mode, output_sequence_length=output_sequence_length, ) text_vectorizer.adapt(text_column) text_vectors = text_vectorizer(text_column) return text_vectorizer, text_vectors assert ( text_vectorizer is not None ), "`text_vectorizer` argument must be not None." assert isinstance( text_vectorizer, TextVectorization ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors." text_vectors = text_vectorizer(text_column) return text_vectors def convert_text_to_vectors_glove( self, text_column: pd.Series, train: bool = True, glove_tokenizer: Tokenizer = None, vocab_size: int = 1000, oov_token: str = "", max_length: int = 50, padding_type: str = "post", truncation_type: str = "post", ) -> Tuple[Tokenizer, np.ndarray] | np.ndarray: """ Converting `text_column` to vectors using `glove_tokenizer`. Parameters: ------------ - text_column : pd.Series Contains the case facts. - train : bool, optional Defines whether the model will be trained or not. (if True, Tokenizer will be trained | else, Tokenizer will used the passed `glove_tokenizer`). (Default is True). - glove_tokenizer : Tokenizer, optional Trained Tokenizer layer will be used for generating embeddings of `text_column` if `train` is False. (Default is None). - vocab_size : int, optional Represents the number of supported vocabulary of the Tokenizer, any token not in this vocabulary will be treated as an out-of-vocabulary token(OOV). (Default is 1000). - oov_tokens : str, optional Represents the token of an out-of-vocabulary token (Default is ""). - max_length : int, optional Defins the output vector's dimension. (Default is 50). - padding_type : str, optional Defines the padding type of the vectors, if the vector size is less than `max_length`, the rest of the `max_length` will be padded with 0 (Default is "post"). - truncation_type : str, optional Defines the truncation type of the vectors, if the vector size is more than `max_length`, the extra of the `max_length` will be truncated (Default is "post"). Returns: -------- - if `train` == True: A tuple contains the following: - glove_tokenizer : Tokenizer Trained Tokenizer layer. - text_padded : np.ndarray An array contains `text_column` vectors. - otherwise: text_padded : np.ndarray An array contains `text_column` vectors. Raises: ------- - AssertionError If train is False and `glove_tokenizer` is None. - AssertionError If train is False and `glove_tokenizer` is not instance of Tokenizer. """ if train: glove_tokenizer = Tokenizer( num_words=vocab_size, oov_token=oov_token) glove_tokenizer.fit_on_texts(text_column) text_sequences = glove_tokenizer.texts_to_sequences(text_column) text_padded = pad_sequences( text_sequences, maxlen=max_length, padding=padding_type, truncating=truncation_type, ) return glove_tokenizer, text_padded assert ( glove_tokenizer is not None ), "`glove_tokenizer` argument must be not None." assert isinstance( glove_tokenizer, Tokenizer ), "`glove_tokenizer` argument must be an instance of Tokenizer." text_sequences = glove_tokenizer.texts_to_sequences(text_column) text_padded = pad_sequences( text_sequences, maxlen=max_length, padding=padding_type, truncating=truncation_type, ) return text_padded def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame: """ Balancing `X_train` and `y_train` to distribute the targets in `y_train` equally. Parameters: ------------ - text_column : pd.Series Contains the case facts. - y_train : pd.Series Contains the training targets. Returns: -------- - shuffled_balanced_df : pd.DataFrame Contains the new balanced dataframe with shuffling indicies. """ df = pd.concat([X_train, y_train], axis=1) first_party = df[df["winner_index"] == 0] second_party = df[df["winner_index"] == 1] upsample_second_party = resample( second_party, replace=True, n_samples=len(first_party), random_state=42 ) upsample_df = pd.concat([upsample_second_party, first_party]) shuffled_indices = np.arange(upsample_df.shape[0]) np.random.shuffle(shuffled_indices) shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :] return shuffled_balanced_df def anonymize_data( self, first_party_names: pd.Series, second_party_names: pd.Series, text_column: pd.Series, ) -> pd.Series: """ Anonymize `text_column` by replacing `first_party_names` and `second_party_names` wit "_PARTY_" tag. Parameters: ------------ - first_party_names : pd.Series Contains all first party names needed to be anonymized. - second_party_names : pd.Series Contains all second party names needed to be anonymized. - text_column : pd.Series Contains all texts needed to be anonymized. Returns: -------- - all_anonyimzed_facts : pd.Series Contains anonymized version of `text_column`. """ all_anonymized_facts = [] for i in range(text_column.shape[0]): facts = text_column.iloc[i] first_party_name = first_party_names.iloc[i] second_party_name = second_party_names.iloc[i] anonymized_facts = self._anonymize_case_facts( first_party_name, second_party_name, facts ) all_anonymized_facts.append(anonymized_facts) return pd.Series(all_anonymized_facts) def preprocess_data(self, text_column: pd.Series) -> pd.Series: """ Preprocessing & cleaning all texts in `text_column`. Parameters: ------------ - text_column : pd.Series Contains all case facts. Returns: -------- - preprocessed_text : pd.Series Contains all texts after being processed. """ preprocessed_text = text_column.apply(self._preprocess_text) return preprocessed_text