import re
import string
from typing import List, Tuple

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.tokenize import RegexpTokenizer
from sklearn.utils import resample

from utils import Doc2VecModel

punct = string.punctuation
stemmer = nltk.stem.PorterStemmer()
eng_stopwords = nltk.corpus.stopwords.words("english")


class Preprocessor:
    """Responsible for preprocessing case facts."""

    def __init__(self) -> None:
        pass

    def _nltk_tokenizer(self, text: str) -> List[str]:
        """
        Tokenize a given `text` using the RegexpTokenizer from the nltk library.

        Parameters:
        -----------
        - text : str
            A string containing the text to be tokenized.

        Returns:
        --------
        - tokens : List[str]
            A list of tokens generated by the tokenizer.
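
        Example:
        --------
        Illustrative sketch:
        >>> Preprocessor()._nltk_tokenizer("The court ruled in 1973.")
        ['The', 'court', 'ruled', 'in', '1973']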
        """

        tokenizer = RegexpTokenizer(r"\w+")
        tokens = tokenizer.tokenize(text)

        return tokens

    def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
        """Split `text_column` into tokens.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains text that needs to be tokenized.

        Returns:
        --------
        - tokenized_text : pd.Series
            Contains the tokenized version of `text_column`.
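
        Example:
        --------
        Illustrative sketch:
        >>> facts = pd.Series(["The court ruled.", "He appealed."])
        >>> tokens = Preprocessor()._tokenize_text(facts)
        >>> tokens.iloc[0]
        ['The', 'court', 'ruled']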
        """

        tokenized_text = text_column.apply(self._nltk_tokenizer)
        return tokenized_text

    def _convert_to_tagged_document(
        self, text_column: pd.Series
    ) -> Tuple[List[List[str]], List[TaggedDocument]]:
        """
        Convert each row of `text_column` to a TaggedDocument.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains the list of tokens of each fact.

        Returns:
        --------
        A tuple containing the following items:
        - tokens_list : List[List[str]]
            Contains the token list of each case in `text_column`.
        - tagged_docs : List[TaggedDocument]
            Contains a TaggedDocument object for each case.
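
        Example:
        --------
        Illustrative sketch:
        >>> tokenized = pd.Series([["court", "ruled"], ["he", "appealed"]])
        >>> tokens, docs = Preprocessor()._convert_to_tagged_document(tokenized)
        >>> docs[0].tags
        ['0']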
        """

        tokens_list = text_column.to_list()
        tagged_docs = [TaggedDocument(t, [str(i)])
                       for i, t in enumerate(tokens_list)]

        return tokens_list, tagged_docs

    def _vectorize_text(
        self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[List[str]]
    ) -> pd.DataFrame:
        """
        Convert the values of `tokens_list` to vectors.

        Parameters:
        -----------
        - doc2vec_model : Doc2Vec
            Trained Doc2Vec model.
        - df : pd.Series
            Used only to supply the index for the generated DataFrame.
        - tokens_list : List[List[str]]
            Contains all tokens of each case.

        Returns:
        --------
        - text_vectors_df : pd.DataFrame
            Contains the vector representation of each case.
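
        Example:
        --------
        Illustrative sketch; `model` (a trained Doc2Vec), `facts`, and
        `tokens_list` are hypothetical stand-ins:
        >>> vectors_df = Preprocessor()._vectorize_text(model, facts, tokens_list)
        >>> vectors_df.shape  # (number of cases, model.vector_size)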
        """

        text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list]
        text_vectors_df = pd.DataFrame(text_vectors, index=df.index)

        return text_vectors_df

    def _anonymize_case_facts(
        self, first_party_name: str, second_party_name: str, facts: str
    ) -> str:
        """
        Anonymize case facts by replacing party names with the "_PARTY_" tag.

        Parameters:
        -----------
        - first_party_name : str
            Represents the first party (petitioner) name.
        - second_party_name : str
            Represents the second party (respondent) name.
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - anonymized_facts : str
            An anonymized version of `facts`.
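
        Example:
        --------
        Illustrative sketch:
        >>> pre = Preprocessor()
        >>> pre._anonymize_case_facts("John Doe", "Acme Corp", "Doe sued Acme.")
        ' _PARTY_ sued _PARTY_ .'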
        """

        # Keep only letters in the party names; this also removes commas,
        # digits, and other punctuation.
        first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name)
        second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name)

        # Replace every occurrence of each name token with the "_PARTY_" tag.
        for name in first_party_name.split():
            facts = re.sub(name, " _PARTY_ ", facts)

        for name in second_party_name.split():
            facts = re.sub(name, " _PARTY_ ", facts)

        # Collapse runs of consecutive "_PARTY_" tags into a single tag.
        regex_continuous_tags = r"(_PARTY_\s+){2,}"
        anonymized_facts = re.sub(regex_continuous_tags, " _PARTY_ ", facts)

        # Normalize whitespace.
        anonymized_facts = re.sub(r"\s+", " ", anonymized_facts)

        return anonymized_facts

    def _preprocess_text(self, text: str) -> str:
        """
        Preprocess and clean `text`, including:
        - lowercasing
        - removing quotation marks
        - removing digits
        - removing punctuation
        - removing brackets, braces, and parentheses
        - removing stopwords
        - stemming tokens

        Parameters:
        -----------
        - text : str
            Text to be processed (cleaned).

        Returns:
        --------
        - processed_text : str
            A preprocessed version of `text`.
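
        Example:
        --------
        Illustrative sketch (assumes the NLTK stopwords and punkt data are
        downloaded):
        >>> Preprocessor()._preprocess_text("The Court ruled (in 1973) against him!")
        'court rule'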
        """

        text = text.lower()

        # Remove quotation marks.
        text = re.sub(r"\'", "", text)

        # Remove digits.
        text = re.sub(r"\d+", "", text)

        # Remove punctuation, but keep underscores (used by the "_PARTY_" tag).
        text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)])

        # Remove any remaining brackets, braces, and parentheses.
        text = re.sub(r"[\[\]\(\)\{\}]+", " ", text)
        tokens = nltk.word_tokenize(text)

        # Drop stopwords and stem the remaining tokens.
        tokens = [stemmer.stem(token)
                  for token in tokens if token not in eng_stopwords]

        processed_text = " ".join(tokens)

        return processed_text

    def convert_text_to_vectors_doc2vec(
        self,
        text_column: pd.Series,
        train: bool = True,
        embeddings_doc2vec: Doc2Vec = None,
    ) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
        """
        Convert `text_column` to vectors using a `Doc2Vec` model.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains the case facts.
        - train : bool, optional
            Defines whether the model will be trained or not (if True, a new
            Doc2Vec model is trained; otherwise the passed `embeddings_doc2vec`
            is used). (Default is True).
        - embeddings_doc2vec : Doc2Vec, optional
            Trained Doc2Vec model used for generating embeddings of
            `text_column` if `train` is False. (Default is None).

        Returns:
        --------
        - If `train` is True, a tuple containing:
            - embeddings_doc2vec : Doc2Vec
                Trained Doc2Vec model.
            - text_vectors_df : pd.DataFrame
                A DataFrame containing the `text_column` vectors.
        - Otherwise:
            - text_vectors_df : pd.DataFrame
                A DataFrame containing the `text_column` vectors.

        Raises:
        -------
        - AssertionError
            If `train` is False and `embeddings_doc2vec` is None.
        - AssertionError
            If `train` is False and `embeddings_doc2vec` is not an instance of Doc2Vec.
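
        Example:
        --------
        Illustrative sketch; `train_facts` and `test_facts` are hypothetical
        pd.Series of case facts:
        >>> pre = Preprocessor()
        >>> model, train_vecs = pre.convert_text_to_vectors_doc2vec(train_facts)
        >>> test_vecs = pre.convert_text_to_vectors_doc2vec(
        ...     test_facts, train=False, embeddings_doc2vec=model
        ... )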
        """

        tokenized_text = self._tokenize_text(text_column)
        tokens_list, tagged_docs = self._convert_to_tagged_document(
            tokenized_text)

        if train:
            doc2vec_model = Doc2VecModel()
            embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model(
                tagged_docs
            )
            text_vectors_df = self._vectorize_text(
                embeddings_doc2vec, text_column, tokens_list
            )
            return embeddings_doc2vec, text_vectors_df

        assert (
            embeddings_doc2vec is not None
        ), "`embeddings_doc2vec` argument must not be None."
        assert isinstance(
            embeddings_doc2vec, Doc2Vec
        ), "`embeddings_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
        text_vectors_df = self._vectorize_text(
            embeddings_doc2vec, text_column, tokens_list
        )

        return text_vectors_df

    def convert_text_to_vectors_tf_idf(
        self,
        text_column: pd.Series,
        ngrams: int = 2,
        max_tokens: int = 10000,
        output_mode: str = "tf-idf",
        train: bool = True,
        text_vectorizer: TextVectorization = None,
    ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
        """
        Convert `text_column` to vectors using a `TextVectorization` layer.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains the case facts.
        - ngrams : int, optional
            Defines the n-gram size (Default is 2).
        - max_tokens : int, optional
            Defines the maximum vocabulary size (`max_tokens`) of
            `text_vectorizer` (Default is 10,000).
        - output_mode : str, optional
            Represents the output vector type: "tf-idf", "binary", or "count"
            (Default is "tf-idf").
        - train : bool, optional
            Defines whether the layer will be trained or not (if True, a new
            TextVectorization layer is trained; otherwise the passed
            `text_vectorizer` is used). (Default is True).
        - text_vectorizer : TextVectorization, optional
            Trained TextVectorization layer used for generating embeddings of
            `text_column` if `train` is False. (Default is None).

        Returns:
        --------
        - If `train` is True, a tuple containing:
            - text_vectorizer : TextVectorization
                Trained TextVectorization layer.
            - text_vectors : tf.Tensor
                A Tensor containing the `text_column` training vectors.
        - Otherwise:
            - text_vectors : tf.Tensor
                A Tensor containing the `text_column` testing vectors.

        Raises:
        -------
        - AssertionError
            If `train` is False and `text_vectorizer` is None.
        - AssertionError
            If `train` is False and `text_vectorizer` is not an instance of TextVectorization.
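
        Example:
        --------
        Illustrative sketch; `train_facts` and `test_facts` are hypothetical
        pd.Series of case facts:
        >>> pre = Preprocessor()
        >>> vectorizer, train_vecs = pre.convert_text_to_vectors_tf_idf(train_facts)
        >>> test_vecs = pre.convert_text_to_vectors_tf_idf(
        ...     test_facts, train=False, text_vectorizer=vectorizer
        ... )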
        """

        if train:
            text_vectorizer = TextVectorization(
                ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
            )
            text_vectorizer.adapt(text_column)
            text_vectors = text_vectorizer(text_column)

            return text_vectorizer, text_vectors

        assert (
            text_vectorizer is not None
        ), "`text_vectorizer` argument must not be None."
        assert isinstance(
            text_vectorizer, TextVectorization
        ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
        text_vectors = text_vectorizer(text_column)

        return text_vectors

    def convert_text_to_vectors_cnn(
        self,
        text_column: pd.Series,
        max_tokens: int = 2000,
        output_sequence_length: int = 500,
        output_mode: str = "int",
        train: bool = True,
        text_vectorizer: TextVectorization = None,
    ) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
        """
        Convert `text_column` to vectors using a `TextVectorization` layer.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains the case facts.
        - max_tokens : int, optional
            Defines the maximum vocabulary size (`max_tokens`) of
            `text_vectorizer` (Default is 2000).
        - output_sequence_length : int, optional
            Represents the dimension of the output vector (Default is 500).
        - output_mode : str, optional
            Represents the output vector type: "int", "binary", or "tf-idf"
            (Default is "int").
        - train : bool, optional
            Defines whether the layer will be trained or not (if True, a new
            TextVectorization layer is trained; otherwise the passed
            `text_vectorizer` is used). (Default is True).
        - text_vectorizer : TextVectorization, optional
            Trained TextVectorization layer used for generating embeddings of
            `text_column` if `train` is False. (Default is None).

        Returns:
        --------
        - If `train` is True, a tuple containing:
            - text_vectorizer : TextVectorization
                Trained TextVectorization layer.
            - text_vectors : tf.Tensor
                A Tensor containing the `text_column` training vectors.
        - Otherwise:
            - text_vectors : tf.Tensor
                A Tensor containing the `text_column` testing vectors.

        Raises:
        -------
        - AssertionError
            If `train` is False and `text_vectorizer` is None.
        - AssertionError
            If `train` is False and `text_vectorizer` is not an instance of TextVectorization.
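
        Example:
        --------
        Illustrative sketch; `train_facts` and `test_facts` are hypothetical
        pd.Series of case facts:
        >>> pre = Preprocessor()
        >>> vectorizer, train_seqs = pre.convert_text_to_vectors_cnn(train_facts)
        >>> test_seqs = pre.convert_text_to_vectors_cnn(
        ...     test_facts, train=False, text_vectorizer=vectorizer
        ... )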
        """

        if train:
            text_vectorizer = TextVectorization(
                max_tokens=max_tokens,
                output_mode=output_mode,
                output_sequence_length=output_sequence_length,
            )
            text_vectorizer.adapt(text_column)
            text_vectors = text_vectorizer(text_column)
            return text_vectorizer, text_vectors

        assert (
            text_vectorizer is not None
        ), "`text_vectorizer` argument must not be None."
        assert isinstance(
            text_vectorizer, TextVectorization
        ), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
        text_vectors = text_vectorizer(text_column)

        return text_vectors

    def convert_text_to_vectors_glove(
        self,
        text_column: pd.Series,
        train: bool = True,
        glove_tokenizer: Tokenizer = None,
        vocab_size: int = 1000,
        oov_token: str = "<OOV>",
        max_length: int = 50,
        padding_type: str = "post",
        truncation_type: str = "post",
    ) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
        """
        Convert `text_column` to vectors using `glove_tokenizer`.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains the case facts.
        - train : bool, optional
            Defines whether the tokenizer will be trained or not (if True, a
            new Tokenizer is fitted; otherwise the passed `glove_tokenizer`
            is used). (Default is True).
        - glove_tokenizer : Tokenizer, optional
            Trained Tokenizer used for generating embeddings of `text_column`
            if `train` is False. (Default is None).
        - vocab_size : int, optional
            Represents the vocabulary size of the Tokenizer; any token not in
            this vocabulary is treated as an out-of-vocabulary (OOV) token.
            (Default is 1000).
        - oov_token : str, optional
            Represents the token used for out-of-vocabulary tokens
            (Default is "<OOV>").
        - max_length : int, optional
            Defines the output vector's dimension. (Default is 50).
        - padding_type : str, optional
            Defines the padding type of the vectors; if the vector size is
            less than `max_length`, the remainder is padded with 0
            (Default is "post").
        - truncation_type : str, optional
            Defines the truncation type of the vectors; if the vector size is
            more than `max_length`, the excess is truncated
            (Default is "post").

        Returns:
        --------
        - If `train` is True, a tuple containing:
            - glove_tokenizer : Tokenizer
                Trained Tokenizer.
            - text_padded : np.ndarray
                An array containing the `text_column` vectors.
        - Otherwise:
            - text_padded : np.ndarray
                An array containing the `text_column` vectors.

        Raises:
        -------
        - AssertionError
            If `train` is False and `glove_tokenizer` is None.
        - AssertionError
            If `train` is False and `glove_tokenizer` is not an instance of Tokenizer.
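
        Example:
        --------
        Illustrative sketch; `train_facts` and `test_facts` are hypothetical
        pd.Series of case facts:
        >>> pre = Preprocessor()
        >>> tokenizer, train_padded = pre.convert_text_to_vectors_glove(train_facts)
        >>> test_padded = pre.convert_text_to_vectors_glove(
        ...     test_facts, train=False, glove_tokenizer=tokenizer
        ... )
        >>> train_padded.shape  # (number of cases, max_length)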
        """

        if train:
            glove_tokenizer = Tokenizer(
                num_words=vocab_size, oov_token=oov_token)
            glove_tokenizer.fit_on_texts(text_column)
            text_sequences = glove_tokenizer.texts_to_sequences(text_column)
            text_padded = pad_sequences(
                text_sequences,
                maxlen=max_length,
                padding=padding_type,
                truncating=truncation_type,
            )

            return glove_tokenizer, text_padded

        assert (
            glove_tokenizer is not None
        ), "`glove_tokenizer` argument must not be None."
        assert isinstance(
            glove_tokenizer, Tokenizer
        ), "`glove_tokenizer` argument must be an instance of Tokenizer."
        text_sequences = glove_tokenizer.texts_to_sequences(text_column)
        text_padded = pad_sequences(
            text_sequences,
            maxlen=max_length,
            padding=padding_type,
            truncating=truncation_type,
        )

        return text_padded

    def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
        """
        Balance `X_train` and `y_train` so that the targets in `y_train` are
        distributed equally, by upsampling the minority class.

        Parameters:
        -----------
        - X_train : pd.Series
            Contains the training case facts.
        - y_train : pd.Series
            Contains the training targets.

        Returns:
        --------
        - shuffled_balanced_df : pd.DataFrame
            Contains the new balanced dataframe with shuffled indices.
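
        Example:
        --------
        Illustrative sketch; assumes `y_train` is a pd.Series named
        "winner_index":
        >>> balanced_df = Preprocessor().balance_data(X_train, y_train)
        >>> balanced_df["winner_index"].value_counts()  # both classes equal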
        """

        df = pd.concat([X_train, y_train], axis=1)

        first_party = df[df["winner_index"] == 0]
        second_party = df[df["winner_index"] == 1]

        # Upsample the second party (assumed to be the minority class) to
        # match the number of first-party rows.
        upsample_second_party = resample(
            second_party, replace=True, n_samples=len(first_party), random_state=42
        )

        upsample_df = pd.concat([upsample_second_party, first_party])

        # Shuffle the rows so the upsampled cases are not grouped together.
        shuffled_indices = np.arange(upsample_df.shape[0])
        np.random.shuffle(shuffled_indices)

        shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]

        return shuffled_balanced_df

    def anonymize_data(
        self,
        first_party_names: pd.Series,
        second_party_names: pd.Series,
        text_column: pd.Series,
    ) -> pd.Series:
        """
        Anonymize `text_column` by replacing `first_party_names` and
        `second_party_names` with the "_PARTY_" tag.

        Parameters:
        -----------
        - first_party_names : pd.Series
            Contains all first party names that need to be anonymized.
        - second_party_names : pd.Series
            Contains all second party names that need to be anonymized.
        - text_column : pd.Series
            Contains all texts that need to be anonymized.

        Returns:
        --------
        - all_anonymized_facts : pd.Series
            Contains the anonymized version of `text_column`.
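
        Example:
        --------
        Illustrative sketch; assumes a DataFrame `df` with these columns:
        >>> anonymized = Preprocessor().anonymize_data(
        ...     df["first_party"], df["second_party"], df["facts"]
        ... )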
        """

        all_anonymized_facts = []

        for i in range(text_column.shape[0]):
            facts = text_column.iloc[i]
            first_party_name = first_party_names.iloc[i]
            second_party_name = second_party_names.iloc[i]
            anonymized_facts = self._anonymize_case_facts(
                first_party_name, second_party_name, facts
            )
            all_anonymized_facts.append(anonymized_facts)

        # Preserve the original index so the result stays aligned with
        # `text_column`.
        return pd.Series(all_anonymized_facts, index=text_column.index)

    def preprocess_data(self, text_column: pd.Series) -> pd.Series:
        """
        Preprocess and clean all texts in `text_column`.

        Parameters:
        -----------
        - text_column : pd.Series
            Contains all case facts.

        Returns:
        --------
        - preprocessed_text : pd.Series
            Contains all texts after being processed.
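
        Example:
        --------
        Illustrative sketch; `df["facts"]` is a hypothetical column of case
        facts:
        >>> cleaned = Preprocessor().preprocess_data(df["facts"])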
        """

        preprocessed_text = text_column.apply(self._preprocess_text)
        return preprocessed_text
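

# Minimal end-to-end usage sketch (illustrative only): it assumes a CSV file
# with "first_party", "second_party", and "facts" columns; the file name
# "cases.csv" is hypothetical.
if __name__ == "__main__":
    df = pd.read_csv("cases.csv")

    preprocessor = Preprocessor()

    # Replace party names with the "_PARTY_" tag, then clean the facts.
    anonymized_facts = preprocessor.anonymize_data(
        df["first_party"], df["second_party"], df["facts"]
    )
    cleaned_facts = preprocessor.preprocess_data(anonymized_facts)

    # Train Doc2Vec embeddings on the cleaned facts.
    embeddings_doc2vec, text_vectors_df = preprocessor.convert_text_to_vectors_doc2vec(
        cleaned_facts
    )
    print(text_vectors_df.shape)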