Upload 5 files
- deployment_utils.py +607 -0
- plotting.py +230 -0
- preprocessing.py +591 -0
- style.css +94 -0
- utils.py +389 -0
deployment_utils.py ADDED @@ -0,0 +1,607 @@
# global
from typing import Tuple, List
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models.doc2vec import Doc2Vec

import transformers
from transformers import pipeline, BertTokenizer

import fasttext

# local
from preprocessing import Preprocessor
from utils import read_data


# read data
X_train, X_test, y_train, y_test = read_data()

# instantiate preprocessor object
preprocessor = Preprocessor()

# load models
doc2vec_model_embeddings = Doc2Vec.load(
    "F:/Graduation Project/Project/models/best_doc2vec_embeddings")
doc2vec_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_doc2vec_model.h5")
tfidf_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_tfidf_model.h5")
cnn_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_cnn_model.h5")
glove_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_glove_model.h5")
lstm_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_lstm_model.h5")
bert_model = keras.models.load_model(
    "F:/Graduation Project/Project/models/best_bert_model.h5",
    custom_objects={"TFBertModel": transformers.TFBertModel})
fasttext_model = fasttext.load_model(
    "F:/Graduation Project/Project/models/best_fasttext_model.bin")
summarization_model = pipeline(
    "summarization", model="facebook/bart-large-cnn")


def extract_case_information(case_content: str) -> Tuple[str, str, str]:
    """
    Extract the petitioner name, respondent name, and case facts from
    `case_content`, whose first three lines are expected to be
    "petitioner:...", "respondent:...", and "facts:...".
    """

    content_list = case_content.split("\n")
    petitioner = re.findall(r"petitioner:(.+)", content_list[0])[0]
    respondent = re.findall(r"respondent:(.+)", content_list[1])[0]
    facts = re.findall(r"facts:(.+)", content_list[2])[0]

    return petitioner, respondent, facts


def generate_random_sample() -> Tuple[str, str, str, int]:
    """
    Fetch a random case from `X_test` for testing.

    Returns:
    --------
    A tuple containing the following:
    - petitioner : str
        Contains the petitioner name.
    - respondent : str
        Contains the respondent name.
    - facts : str
        Contains the case facts.
    - label : int
        Represents the winning index (0 = petitioner, 1 = respondent).
    """

    random_idx = np.random.randint(low=0, high=len(X_test))

    petitioner = X_test["first_party"].iloc[random_idx]
    respondent = X_test["second_party"].iloc[random_idx]
    facts = X_test["Facts"].iloc[random_idx]
    label = y_test.iloc[random_idx][0]

    return petitioner, respondent, facts, label


def generate_highlighted_words(facts: str, petitioner_words: List[str], respondent_words: List[str]) -> str:
    """
    Highlight `petitioner_words` and `respondent_words` inside `facts` for
    model interpretation.

    Parameters:
    -----------
    - facts : str
        Facts of a specific case.
    - petitioner_words : List[str]
        Contains all words that the model attends to as petitioner words.
    - respondent_words : List[str]
        Contains all words that the model attends to as respondent words.

    Returns:
    --------
    - rendered_text : str
        Contains `facts` wrapped in HTML markup so the highlighting can be
        visualized with CSS.

    Example:
    --------
    >>> facts_ = 'Mohammed shot Aly after a hot negotiation happened between
    ... them about the profits of their company'
    >>> petitioner_words_ = ['shot', 'hot']
    >>> respondent_words_ = ['profits']
    >>> generate_highlighted_words(facts_, petitioner_words_, respondent_words_)

    >>> output:
    <div class='text-facts'> Mohammed <span class='highlight-petitioner'>shot</span>
    Aly after a <span class='highlight-petitioner'>hot</span> negotiation happened
    between them about the <span class='highlight-respondent'>profits</span> of their
    company </div>
    """

    rendered_text = '<div class="text-facts"> '

    for word in facts.split():
        if word in petitioner_words:
            highlight_word = ' <span class="highlight-petitioner"> ' + word + " </span> "
            rendered_text += highlight_word

        elif word in respondent_words:
            highlight_word = ' <span class="highlight-respondent"> ' + word + " </span> "
            rendered_text += highlight_word

        else:
            rendered_text += " " + word

    rendered_text += " </div>"

    return rendered_text


class VectorizerGenerator:
    """Responsible for creating the tokenizers and text vectorizers used by
    JudgerAI's models."""

    def __init__(self) -> None:
        pass

    def generate_tf_idf_vectorizer(self) -> keras.layers.TextVectorization:
        """
        Generate the best text vectorizer of the tf-idf model (3rd combination).

        Returns:
        -------
        - text_vectorizer : keras.layers.TextVectorization
            Represents the case facts' vectorizer that converts case facts to
            numerical tensors.
        """

        first_party_names = X_train["first_party"]
        second_party_names = X_train["second_party"]
        facts = X_train["Facts"]

        anonymized_facts = preprocessor.anonymize_data(
            first_party_names, second_party_names, facts)

        text_vectorizer, _ = preprocessor.convert_text_to_vectors_tf_idf(
            anonymized_facts)

        return text_vectorizer

    def generate_cnn_vectorizer(self) -> keras.layers.TextVectorization:
        """
        Generate the best text vectorizer of the CNN model (2nd combination).

        Returns:
        -------
        - text_vectorizer : keras.layers.TextVectorization
            Represents the case facts' vectorizer that converts case facts to
            numerical tensors.
        """

        balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
        X_train_balanced = balanced_df["Facts"]

        text_vectorizer, _ = preprocessor.convert_text_to_vectors_cnn(
            X_train_balanced)

        return text_vectorizer

    def generate_glove_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
        """
        Generate the best tokenizer of the GloVe model (2nd combination).

        Returns:
        -------
        - glove_tokenizer : keras.preprocessing.text.Tokenizer
            Represents the case facts' tokenizer that converts case facts to
            numerical tensors.
        """

        balanced_df = preprocessor.balance_data(X_train["Facts"], y_train)
        X_train_balanced = balanced_df["Facts"]

        glove_tokenizer, _ = preprocessor.convert_text_to_vectors_glove(
            X_train_balanced)

        return glove_tokenizer

    def generate_lstm_tokenizer(self) -> keras.preprocessing.text.Tokenizer:
        """
        Generate the best text tokenizer of the LSTM model (1st combination).

        Returns:
        -------
        - lstm_tokenizer : keras.preprocessing.text.Tokenizer
            Represents the case facts' tokenizer that converts case facts to
            numerical tensors.
        """

        lstm_tokenizer = Tokenizer(num_words=18430)
        # fit on the facts column; fitting on the whole DataFrame would only
        # iterate over its column names
        lstm_tokenizer.fit_on_texts(X_train["Facts"])

        return lstm_tokenizer

    def generate_bert_tokenizer(self) -> transformers.BertTokenizer:
        """
        Generate the best BERT tokenizer of the BERT model (1st combination).

        Returns:
        -------
        - bert_tokenizer : transformers.BertTokenizer
            Represents the case facts' tokenizer that converts case facts to
            input-ids tensors.
        """

        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        return bert_tokenizer


class DataPreparator:
    """Responsible for preparing the case facts, i.e. converting case facts to
    numerical vectors using a `VectorizerGenerator` object."""

    def __init__(self) -> None:
        self.vectorizer_generator = VectorizerGenerator()

    def prepare_doc2vec(self, facts: str) -> pd.DataFrame:
        """
        Convert the `facts` string to a numerical vector
        using `doc2vec_model_embeddings`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vectors : pd.DataFrame
            A one-row DataFrame representing the 50-d vector of `facts`.
        """

        facts = pd.Series(facts)
        facts_processed = preprocessor.preprocess_data(facts)
        facts_vectors = preprocessor.convert_text_to_vectors_doc2vec(
            facts_processed, train=False, embeddings_doc2vec=doc2vec_model_embeddings)

        return facts_vectors

    def _anonymize_facts(self, first_party_name: str, second_party_name: str, facts: str) -> str:
        """
        Anonymize case `facts` by replacing `first_party_name` & `second_party_name` with
        the generic tag "_PARTY_".

        Parameters:
        -----------
        - first_party_name : str
            Represents the petitioner name.
        - second_party_name : str
            Represents the respondent name.
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - anonymized_facts : str
            Represents `facts` after anonymization.
        """

        anonymized_facts = preprocessor._anonymize_case_facts(
            first_party_name, second_party_name, facts)

        return anonymized_facts

    def prepare_tf_idf(self, anonymized_facts: str) -> tf.Tensor:
        """
        Convert the `anonymized_facts` string to a numerical vector
        using the tf-idf `vectorizer_generator` in the 3rd combination.

        Parameters:
        -----------
        - anonymized_facts : str
            Represents the case facts after anonymization.

        Returns:
        -------
        - facts_vector : tf.Tensor
            A 10000-d Tensor representing `facts`.
        """

        anonymized_facts = pd.Series(anonymized_facts)
        tf_idf_vectorizer = self.vectorizer_generator.generate_tf_idf_vectorizer()

        facts_vector = preprocessor.convert_text_to_vectors_tf_idf(
            anonymized_facts, train=False, text_vectorizer=tf_idf_vectorizer)

        return facts_vector

    def prepare_cnn(self, facts: str) -> tf.Tensor:
        """
        Convert the `facts` string to a numerical vector
        using the CNN `vectorizer_generator` in the 2nd combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vector : tf.Tensor
            A 2000-d Tensor representing `facts`.
        """

        facts = pd.Series(facts)

        cnn_vectorizer = self.vectorizer_generator.generate_cnn_vectorizer()

        facts_vector = preprocessor.convert_text_to_vectors_cnn(
            facts, train=False, text_vectorizer=cnn_vectorizer)

        return facts_vector

    def prepare_glove(self, facts: str) -> np.ndarray:
        """
        Convert the `facts` string to a numerical vector
        using the GloVe `vectorizer_generator` in the 2nd combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vector : np.ndarray
            A 50-d np.ndarray representing `facts`.
        """

        facts = pd.Series(facts)

        glove_tokenizer = self.vectorizer_generator.generate_glove_tokenizer()

        facts_vector = preprocessor.convert_text_to_vectors_glove(
            facts, train=False, glove_tokenizer=glove_tokenizer)

        return facts_vector

    def prepare_lstm(self, facts: str) -> np.ndarray:
        """
        Convert the `facts` string to a numerical vector
        using the LSTM `vectorizer_generator` in the 1st combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - facts_vector_padded : np.ndarray
            A 974-d np.ndarray representing `facts`.
        """

        facts = pd.Series(facts)
        lstm_tokenizer = self.vectorizer_generator.generate_lstm_tokenizer()
        facts_vector = lstm_tokenizer.texts_to_sequences(facts)
        facts_vector_padded = pad_sequences(facts_vector, maxlen=974)

        return facts_vector_padded

    def prepare_bert(self, facts: str) -> tf.Tensor:
        """
        Convert the `facts` string to a numerical vector
        using the BERT tokenizer in the 1st combination.

        Parameters:
        -----------
        - facts : str
            Represents the case facts.

        Returns:
        -------
        - tf.Tensor
            A 256-d tf.Tensor representing the input ids of `facts`.
        """

        bert_tokenizer = self.vectorizer_generator.generate_bert_tokenizer()
        facts_vector_dict = bert_tokenizer.encode_plus(
            facts,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )

        return facts_vector_dict["input_ids"]


class Predictor:
    """Responsible for getting predictions from JudgerAI's models."""

    def __init__(self) -> None:
        self.data_preparator = DataPreparator()

    def predict_doc2vec(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `doc2vec_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_doc2vec(facts)
        predictions = doc2vec_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_tf_idf(self, anonymized_facts: str) -> np.ndarray:
        """
        Get the prediction for `anonymized_facts` using `tfidf_model`.

        Parameters:
        -----------
        - anonymized_facts : str
            Represents the case facts after anonymization.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_tf_idf(anonymized_facts)
        predictions = tfidf_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_cnn(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `cnn_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_cnn(facts)
        predictions = cnn_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_glove(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `glove_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_glove(facts)
        predictions = glove_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_lstm(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `lstm_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_lstm(facts)
        predictions = lstm_model.predict(facts_vector)

        pet_res_scores = []
        for i in predictions:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def predict_bert(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `bert_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - predictions : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        facts_vector = self.data_preparator.prepare_bert(facts)
        predictions = bert_model.predict(facts_vector)

        return predictions

    def predict_fasttext(self, facts: str) -> np.ndarray:
        """
        Get the prediction for `facts` using `fasttext_model`.

        Parameters:
        ----------
        - facts : str
            Represents the case facts.

        Returns:
        --------
        - pet_res_scores : np.ndarray
            An array containing 2 elements: the probability of the petitioner
            winning and the probability of the respondent winning.
        """

        prediction = fasttext_model.predict(facts)[1]
        prediction = np.array([prediction])

        pet_res_scores = []
        for i in prediction:
            temp = i[0]
            pet_res_scores.append(np.array([1 - temp, temp]))

        return np.array(pet_res_scores)

    def summarize_facts(self, facts: str) -> str:
        """Summarize `facts` using the BART summarization pipeline."""
        summarized_case_facts = summarization_model(facts)[0]['summary_text']
        return summarized_case_facts
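The module above wires everything together at import time and exposes `Predictor` as the entry point. A minimal usage sketch, assuming the hard-coded model files load successfully and `read_data` (from the `utils.py` in this upload) returns the expected split:

from deployment_utils import Predictor, generate_random_sample

predictor = Predictor()

# pick a random held-out case and score it with one of the models
petitioner, respondent, facts, label = generate_random_sample()
scores = predictor.predict_lstm(facts)  # shape (1, 2): [P(petitioner), P(respondent)]
print(f"{petitioner} vs. {respondent}: P(petitioner wins) = {scores[0][0]:.2f}")

# condense long case facts before rendering them in the UI
summary = predictor.summarize_facts(facts)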
plotting.py ADDED @@ -0,0 +1,230 @@
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from tensorflow import keras


class PlottingManager:
    """Responsible for providing plots & visualizations for the models."""

    def __init__(self) -> None:
        """Define the style used for all visualizations."""
        plt.style.use("seaborn")

    def plot_subplots_curve(
        self,
        training_measure: List[List[float]],
        validation_measure: List[List[float]],
        title: str,
        train_color: str = "orangered",
        validation_color: str = "dodgerblue",
    ) -> None:
        """
        Plot subplots of the elements of `training_measure` vs. `validation_measure`.

        Parameters:
        ------------
        - training_measure : List[List[float]]
            A `k` by `num_epochs` list containing the training measure (loss
            or accuracy) for each fold.
        - validation_measure : List[List[float]]
            A `k` by `num_epochs` list containing the validation measure (loss
            or accuracy) for each fold.
        - title : str
            Represents the title of the plot.
        - train_color : str, optional
            Represents the graph color for the `training_measure`. (Default is "orangered").
        - validation_color : str, optional
            Represents the graph color for the `validation_measure`. (Default is "dodgerblue").
        """

        plt.figure(figsize=(12, 8))

        for i in range(len(training_measure)):
            plt.subplot(2, 2, i + 1)
            plt.plot(training_measure[i], c=train_color)
            plt.plot(validation_measure[i], c=validation_color)
            plt.title("Fold " + str(i + 1))

        plt.suptitle(title)
        plt.show()

    def plot_heatmap(
        self, measure: List[List[float]], title: str, cmap: str = "coolwarm"
    ) -> None:
        """
        Plot a heatmap of the values in `measure`.

        Parameters:
        ------------
        - measure : List[List[float]]
            A `k` by `num_epochs` list containing the measure (loss or
            accuracy) for each fold.
        - title : str
            Title of the plot.
        - cmap : str, optional
            Color map of the plot (default is "coolwarm").
        """

        # transpose the array to make it `num_epochs` by `k`
        values_array = np.array(measure).T
        df_cm = pd.DataFrame(
            values_array,
            range(1, values_array.shape[0] + 1),
            ["fold " + str(i + 1) for i in range(values_array.shape[1])],
        )

        plt.figure(figsize=(10, 8))
        plt.title(
            title + " Throughout " + str(values_array.shape[1]) + " Folds", pad=20
        )
        sn.heatmap(df_cm, annot=True, cmap=cmap, annot_kws={"size": 10})
        plt.show()

    def plot_average_curves(
        self,
        title: str,
        x: List[float],
        y: List[float],
        x_label: str,
        y_label: str,
        train_color: str = "orangered",
        validation_color: str = "dodgerblue",
    ) -> None:
        """
        Plot the curves of `x` against `y`, where x and y are the training and
        validation measures (loss or accuracy).

        Parameters:
        ------------
        - title : str
            Title of the plot.
        - x : List[float]
            Training measure of the models (loss or accuracy).
        - y : List[float]
            Validation measure of the models (loss or accuracy).
        - x_label : str
            Label of the training measure to put in the plot legend.
        - y_label : str
            Label of the validation measure to put in the plot legend.
        - train_color : str, optional
            Color of the training plot (default is "orangered").
        - validation_color : str, optional
            Color of the validation plot (default is "dodgerblue").
        """

        plt.title(title, pad=20)
        plt.plot(x, c=train_color, label=x_label)
        plt.plot(y, c=validation_color, label=y_label)
        plt.legend()
        plt.show()

    def plot_roc_curve(
        self,
        all_models: List[keras.models.Sequential],
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> None:
        """
        Plot the AUC-ROC curve of every model in `all_models`.

        Parameters:
        ------------
        - all_models : List[keras.models.Sequential]
            Contains all trained models; the number of models equals the `k`
            of the k-fold cross-validation.
        - X_test : pd.DataFrame
            Contains the testing vectors.
        - y_test : pd.Series
            Contains the testing labels.
        """

        plt.figure(figsize=(12, 8))
        for i, model in enumerate(all_models):
            y_pred = model.predict(X_test).ravel()
            fpr, tpr, _ = roc_curve(y_test, y_pred)
            auc_curve = auc(fpr, tpr)
            plt.subplot(2, 2, i + 1)
            plt.plot([0, 1], [0, 1], color="dodgerblue", linestyle="--")
            plt.plot(
                fpr,
                tpr,
                color="orangered",
                label=f"Fold {str(i+1)} (area = {auc_curve:.3f})",
            )
            plt.legend(loc="best")
            plt.title(f"Fold {str(i+1)}")

        plt.suptitle("AUC-ROC curves")
        plt.show()

    def plot_classification_report(
        self, model: keras.models.Sequential, X_test: pd.DataFrame, y_test: pd.Series
    ) -> str:
        """
        Build the classification report of the passed `model`.

        Parameters:
        ------------
        - model : keras.models.Sequential
            The trained model that will be evaluated.
        - X_test : pd.DataFrame
            Contains the testing vectors.
        - y_test : pd.Series
            Contains the testing labels.

        Returns:
        --------
        - cls_report : str
            The classification report for the given model and testing data,
            formatted as a string.
        """

        y_pred = model.predict(X_test).ravel()
        preds = np.where(y_pred > 0.5, 1, 0)
        cls_report = classification_report(y_test, preds)

        return cls_report

    def plot_confusion_matrix(
        self,
        all_models: List[keras.models.Sequential],
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> None:
        """
        Plot the confusion matrix of each model in `all_models`.

        Parameters:
        ------------
        - all_models : List[keras.models.Sequential]
            Contains all trained models; the number of models equals the `k`
            of the k-fold cross-validation.
        - X_test : pd.DataFrame
            Contains the testing vectors.
        - y_test : pd.Series
            Contains the testing labels.
        """

        _, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

        for i, (model, ax) in enumerate(zip(all_models, axes.flatten())):
            y_pred = model.predict(X_test).ravel()
            preds = np.where(y_pred > 0.5, 1, 0)

            conf_matrix = confusion_matrix(y_test, preds)
            sn.heatmap(conf_matrix, annot=True, ax=ax)
            ax.set_title(f"Fold {i+1}")

        plt.suptitle("Confusion Matrices")
        plt.tight_layout()
        plt.show()
preprocessing.py
ADDED
@@ -0,0 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# global
|
2 |
+
import string
|
3 |
+
from typing import List, Tuple
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
|
8 |
+
import re
|
9 |
+
import nltk
|
10 |
+
|
11 |
+
from sklearn.utils import resample
|
12 |
+
|
13 |
+
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
14 |
+
from nltk.tokenize import RegexpTokenizer
|
15 |
+
|
16 |
+
import tensorflow as tf
|
17 |
+
from keras.layers import TextVectorization
|
18 |
+
from keras.preprocessing.text import Tokenizer
|
19 |
+
from keras.utils import pad_sequences
|
20 |
+
|
21 |
+
# local
|
22 |
+
from utils import Doc2VecModel
|
23 |
+
|
24 |
+
|
25 |
+
punct = string.punctuation
|
26 |
+
stemmer = nltk.stem.PorterStemmer()
|
27 |
+
eng_stopwords = nltk.corpus.stopwords.words("english")
|
28 |
+
|
29 |
+
|
30 |
+
class Preprocessor:
|
31 |
+
"""Responsible for preprocessing case facts."""
|
32 |
+
|
33 |
+
def __init__(self) -> None:
|
34 |
+
pass
|
35 |
+
|
36 |
+
def _nltk_tokenizer(self, text: str) -> List[str]:
|
37 |
+
"""
|
38 |
+
Tokenize a given `text` using the RegexpTokenizer from the nltk library.
|
39 |
+
|
40 |
+
Parameters:
|
41 |
+
-----------
|
42 |
+
- text : str
|
43 |
+
A string containing the text to be tokenized.
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
--------
|
47 |
+
- tokens : List[str]
|
48 |
+
A list of tokens generated by the tokenizer.
|
49 |
+
"""
|
50 |
+
|
51 |
+
tokenizer = RegexpTokenizer(r"\w+")
|
52 |
+
tokens = tokenizer.tokenize(text)
|
53 |
+
|
54 |
+
return tokens
|
55 |
+
|
56 |
+
def _tokenize_text(self, text_column: pd.Series) -> pd.Series:
|
57 |
+
"""Splitting `text_column` into tokens.
|
58 |
+
|
59 |
+
Parameters:
|
60 |
+
------------
|
61 |
+
- text_column : pd.Series
|
62 |
+
Contains text that needs to be tokenized.
|
63 |
+
|
64 |
+
Returns:
|
65 |
+
--------
|
66 |
+
- tokenized_text : pd.Series
|
67 |
+
Contains tokenized version of `text_column`.
|
68 |
+
"""
|
69 |
+
|
70 |
+
tokenized_text = text_column.apply(self._nltk_tokenizer)
|
71 |
+
return tokenized_text
|
72 |
+
|
73 |
+
def _convert_to_tagged_document(
|
74 |
+
self, text_column: pd.Series
|
75 |
+
) -> Tuple[List[str], List[TaggedDocument]]:
|
76 |
+
"""
|
77 |
+
Convert `text_column` of specific to TaggedDocuments.
|
78 |
+
|
79 |
+
Parameters:
|
80 |
+
------------
|
81 |
+
- column : pd.Series
|
82 |
+
Contains the list of tokens of each fact.
|
83 |
+
|
84 |
+
Returns:
|
85 |
+
--------
|
86 |
+
A tuble containing the following items:
|
87 |
+
- tokens_list : list[str]
|
88 |
+
Contains all tokens of each case in the `text_column`.
|
89 |
+
- tagged_docs : list[TaggedDocument]
|
90 |
+
Contains TaggedDocument object for each case.
|
91 |
+
"""
|
92 |
+
|
93 |
+
tokens_list = text_column.to_list()
|
94 |
+
tagged_docs = [TaggedDocument(t, [str(i)])
|
95 |
+
for i, t in enumerate(tokens_list)]
|
96 |
+
|
97 |
+
return tokens_list, tagged_docs
|
98 |
+
|
99 |
+
def _vectorize_text(
|
100 |
+
self, doc2vec_model: Doc2Vec, df: pd.Series, tokens_list: List[str]
|
101 |
+
) -> pd.DataFrame:
|
102 |
+
"""
|
103 |
+
Convert values of `tokens_list` to a vector.
|
104 |
+
|
105 |
+
Parameters:
|
106 |
+
-----------
|
107 |
+
- doc2vec_model : Doc2Vev
|
108 |
+
Trained Doc2Vec model.
|
109 |
+
- df : pd.Series
|
110 |
+
This will use only to get its indicies for the new generated dataframe.
|
111 |
+
- tokens_list : List[str]
|
112 |
+
Contains all tokens of each case.
|
113 |
+
|
114 |
+
Returns:
|
115 |
+
--------
|
116 |
+
- text_vectors_df : pd.DataFrame
|
117 |
+
Contains the vector representaion for each case.
|
118 |
+
"""
|
119 |
+
|
120 |
+
text_vectors = [doc2vec_model.infer_vector(doc) for doc in tokens_list]
|
121 |
+
text_vectors_df = pd.DataFrame(text_vectors, index=df.index)
|
122 |
+
|
123 |
+
return text_vectors_df
|
124 |
+
|
125 |
+
def _anonymize_case_facts(
|
126 |
+
self, first_party_name: str, second_party_name: str, facts: str
|
127 |
+
) -> str:
|
128 |
+
"""
|
129 |
+
Anonymize case facts by replacing its party names with "_PARTY_" tag.
|
130 |
+
|
131 |
+
Parameters:
|
132 |
+
------------
|
133 |
+
- first_party_name : str
|
134 |
+
Represents first party name or petitioner name.
|
135 |
+
- second_party_name : str
|
136 |
+
Represents second party name or respondent name.
|
137 |
+
- facts : str
|
138 |
+
Represents case facts.
|
139 |
+
|
140 |
+
Returns:
|
141 |
+
--------
|
142 |
+
- anonymized_facts : str
|
143 |
+
An anonymized version of `facts`.
|
144 |
+
"""
|
145 |
+
|
146 |
+
# remove any commas and any non alphabet characters
|
147 |
+
first_party_name = re.sub(r"[\,+]", " ", first_party_name)
|
148 |
+
first_party_name = re.sub(r"[^a-zA-Z]", " ", first_party_name)
|
149 |
+
|
150 |
+
second_party_name = re.sub(r"[\,+]", " ", second_party_name)
|
151 |
+
second_party_name = re.sub(r"[^a-zA-Z]", " ", second_party_name)
|
152 |
+
|
153 |
+
for name in first_party_name.split():
|
154 |
+
facts = re.sub(name, " _PARTY_ ", facts)
|
155 |
+
|
156 |
+
for name in second_party_name.split():
|
157 |
+
facts = re.sub(name, " _PARTY_ ", facts)
|
158 |
+
|
159 |
+
# replace any consecutive _PARTY_ tags with only one _PARTY_ tag.
|
160 |
+
regex_continous_tags = r"(_PARTY_\s+){2,}"
|
161 |
+
anonymized_facts = re.sub(regex_continous_tags, " _PARTY_ ", facts)
|
162 |
+
# remove ant consecutive spaces
|
163 |
+
anonymized_facts = re.sub(r"\s+", " ", anonymized_facts)
|
164 |
+
|
165 |
+
return anonymized_facts
|
166 |
+
|
167 |
+
def _preprocess_text(self, text: str) -> str:
|
168 |
+
"""
|
169 |
+
Preprocessing & cleaning `text` including:
|
170 |
+
- lowercasing
|
171 |
+
- removing quotation marks
|
172 |
+
- removing digits
|
173 |
+
- removing punctuation
|
174 |
+
- removing brackets, braces, and paranthesis
|
175 |
+
- removeing stopwords
|
176 |
+
- stemming tokens
|
177 |
+
|
178 |
+
Parameters:
|
179 |
+
------------
|
180 |
+
- text : str
|
181 |
+
Text need to be processed (cleaned).
|
182 |
+
|
183 |
+
Returns:
|
184 |
+
--------
|
185 |
+
- processed_text : str
|
186 |
+
A preprocessed version of `text`.
|
187 |
+
"""
|
188 |
+
|
189 |
+
text = text.lower()
|
190 |
+
# remove quotation marks
|
191 |
+
text = re.sub(r"\'", "", text)
|
192 |
+
# remove digits
|
193 |
+
text = re.sub(r"\d+", "", text)
|
194 |
+
# remove punctuation but with keeping '_' letter
|
195 |
+
text = "".join([ch for ch in text if (ch == "_") or (ch not in punct)])
|
196 |
+
# remove brackets, braces, and parantheses
|
197 |
+
text = re.sub(r"[\[\]\(\)\{\}]+", " ", text)
|
198 |
+
tokens = nltk.word_tokenize(text)
|
199 |
+
# remove stopwords and stemming tokens
|
200 |
+
tokens = [stemmer.stem(token)
|
201 |
+
for token in tokens if token not in eng_stopwords]
|
202 |
+
# convert tokens back to string
|
203 |
+
processed_text = " ".join(tokens)
|
204 |
+
|
205 |
+
return processed_text
|
206 |
+
|
207 |
+
def convert_text_to_vectors_doc2vec(
|
208 |
+
self,
|
209 |
+
text_column: pd.Series,
|
210 |
+
train: bool = True,
|
211 |
+
embeddings_doc2vec: Doc2Vec = None,
|
212 |
+
) -> Tuple[Doc2Vec, pd.DataFrame] | pd.DataFrame:
|
213 |
+
"""
|
214 |
+
Converting `text_column` to vectors using `Doc2Vec` model
|
215 |
+
|
216 |
+
Parameters:
|
217 |
+
------------
|
218 |
+
- text_column : pd.Series
|
219 |
+
Contains the case facts.
|
220 |
+
- train : bool, optional
|
221 |
+
Defines whether the model will be trained or not. (if True, Doc2Vec will be trained |
|
222 |
+
else, Doc2Vec will used the passed `embeddings_Doc2Vec`). (Default is True).
|
223 |
+
- embeddings_doc2vec : Doc2Vec, optional
|
224 |
+
Trained Doc2Vec model will be used for generating embeddings of `text_column` if
|
225 |
+
`train` is False. (Default is None).
|
226 |
+
|
227 |
+
Returns:
|
228 |
+
--------
|
229 |
+
1. A tuple contains the following:
|
230 |
+
- embeddings_doc2vec : Doc2Vec
|
231 |
+
Trained Doc2Vec model.
|
232 |
+
- text_vectors_df : pd.DataFrame
|
233 |
+
A DataFrame contains `text_column` vectors if `train` is True.
|
234 |
+
|
235 |
+
2. text_vectors_df : pd.DataFrame
|
236 |
+
A DataFrame contains `text_column` vectors if `train` is False.
|
237 |
+
|
238 |
+
Raises:
|
239 |
+
-------
|
240 |
+
- AssertionError
|
241 |
+
If train is False and `embeddings_doc2vec` is None.
|
242 |
+
- AssertionError
|
243 |
+
If train is False and `embedding_doc2vec` is not an instance of Doc2Vec
|
244 |
+
"""
|
245 |
+
|
246 |
+
tokenized_text = self._tokenize_text(text_column)
|
247 |
+
tokens_list, tagged_docs = self._convert_to_tagged_document(
|
248 |
+
tokenized_text)
|
249 |
+
|
250 |
+
if train:
|
251 |
+
doc2vec_model = Doc2VecModel()
|
252 |
+
embeddings_doc2vec = doc2vec_model.train_doc2vec_embeddings_model(
|
253 |
+
tagged_docs
|
254 |
+
)
|
255 |
+
text_vectors_df = self._vectorize_text(
|
256 |
+
embeddings_doc2vec, text_column, tokens_list
|
257 |
+
)
|
258 |
+
return embeddings_doc2vec, text_vectors_df
|
259 |
+
|
260 |
+
assert (
|
261 |
+
embeddings_doc2vec is not None
|
262 |
+
), "`embedding_doc2vec` argument must be not None."
|
263 |
+
assert isinstance(
|
264 |
+
embeddings_doc2vec, Doc2Vec
|
265 |
+
), "`embedding_doc2vec` argument must be an instance of Doc2Vec to infer vectors."
|
266 |
+
text_vectors_df = self._vectorize_text(
|
267 |
+
embeddings_doc2vec, text_column, tokens_list
|
268 |
+
)
|
269 |
+
|
270 |
+
return text_vectors_df
|
271 |
+
|
272 |
+
def convert_text_to_vectors_tf_idf(
|
273 |
+
self,
|
274 |
+
text_column: pd.Series,
|
275 |
+
ngrams: int = 2,
|
276 |
+
max_tokens: int = 10000,
|
277 |
+
output_mode: str = "tf-idf",
|
278 |
+
train: bool = True,
|
279 |
+
text_vectorizer: TextVectorization = None,
|
280 |
+
) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
|
281 |
+
"""
|
282 |
+
Converting `text_column` to vectors using `TextVectorization` layer.
|
283 |
+
|
284 |
+
Parameters:
|
285 |
+
------------
|
286 |
+
- text_column : pd.Series
|
287 |
+
Contains the case facts.
|
288 |
+
- ngrams : int, optional
|
289 |
+
Defines the number of n-gram (Default is 2).
|
290 |
+
- max_tokens : int, optional
|
291 |
+
Defines the number of max_tokens of `text_vectorizer` (Default is 10,000).
|
292 |
+
- output_mode : str, optional
|
293 |
+
Represents the output vectors type whether it is "tfi-df" or "binary" or "count"
|
294 |
+
(Default is "tf-idf").
|
295 |
+
- train : bool, optional
|
296 |
+
Defines whether the model will be trained or not. (if True, TextVectorization
|
297 |
+
will be trained, else, TextVectorization will used the passed `text_vectorizer`).
|
298 |
+
(Default is True).
|
299 |
+
- text_vectorizer : TextVectorization, optional
|
300 |
+
Trained TextVectorization layer will be used for generating embeddings of
|
301 |
+
`text_column` if `train` is False. (Default is None).
|
302 |
+
|
303 |
+
Returns:
|
304 |
+
--------
|
305 |
+
- if `train` == True:
|
306 |
+
A tuple contains the following:
|
307 |
+
- text_vectorizer : TextVectorization
|
308 |
+
Trained TextVectorization layer.
|
309 |
+
- text_vectors : tf.Tensor
|
310 |
+
A Tensor contains `text_column` training vectors.
|
311 |
+
- otherwise:
|
312 |
+
text_vectors : tf.Tensor
|
313 |
+
A Tensor contains `text_column` testing vectors.
|
314 |
+
|
315 |
+
Raises:
|
316 |
+
-------
|
317 |
+
- AssertionError
|
318 |
+
If train is False and `text_vectorizer` is None.
|
319 |
+
- AssertionError
|
320 |
+
If train is False and `text_vectorizer` is not an instance of TextVectorization.
|
321 |
+
"""
|
322 |
+
|
323 |
+
if train:
|
324 |
+
text_vectorizer = TextVectorization(
|
325 |
+
ngrams=ngrams, max_tokens=max_tokens, output_mode=output_mode
|
326 |
+
)
|
327 |
+
text_vectorizer.adapt(text_column)
|
328 |
+
text_vectors = text_vectorizer(text_column)
|
329 |
+
|
330 |
+
return text_vectorizer, text_vectors
|
331 |
+
|
332 |
+
assert (
|
333 |
+
text_vectorizer is not None
|
334 |
+
), "`text_vectorizer` argument must be not None."
|
335 |
+
assert isinstance(
|
336 |
+
text_vectorizer, TextVectorization
|
337 |
+
), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
|
338 |
+
text_vectors = text_vectorizer(text_column)
|
339 |
+
|
340 |
+
return text_vectors
|
341 |
+
|
342 |
+
def convert_text_to_vectors_cnn(
|
343 |
+
self,
|
344 |
+
text_column: pd.Series,
|
345 |
+
max_tokens: int = 2000,
|
346 |
+
output_sequence_length: int = 500,
|
347 |
+
output_mode: str = "int",
|
348 |
+
train: bool = True,
|
349 |
+
text_vectorizer: TextVectorization = None,
|
350 |
+
) -> Tuple[TextVectorization, tf.Tensor] | tf.Tensor:
|
351 |
+
"""
|
352 |
+
Converting `text_column` to vectors using `TextVectorization` layer.
|
353 |
+
|
354 |
+
Parameters:
|
355 |
+
------------
|
356 |
+
- text_column : pd.Series
|
357 |
+
Contains the case facts.
|
358 |
+
- max_tokens : int, optional
|
359 |
+
Defines the number of max_tokens of `text_vectorizer` (Default is 2000).
|
360 |
+
- output_sequence_length : int, optional
|
361 |
+
Represents the dimensions of the output vector (Default is 500).
|
362 |
+
- output_mode : str, optional
|
363 |
+
Represents the output vectors type whether it is "int" or "binary" or "tfi-df".
|
364 |
+
- train : bool, optional
|
365 |
+
Defines whether the model will be trained or not. (if True,
|
366 |
+
TextVectorization will be trained | else, TextVectorization will used the
|
367 |
+
passed `text_vectorizer`). (Default is True).
|
368 |
+
- text_vectorizer : TextVectorization, optional
|
369 |
+
Trained TextVectorization layer will be used for generating embeddings of
|
370 |
+
`text_column` if `train` is False. (Default is None).
|
371 |
+
|
372 |
+
Returns:
|
373 |
+
--------
|
374 |
+
- if `train` == True:
|
375 |
+
A tuple contains the following:
|
376 |
+
- text_vectorizer : TextVectorization
|
377 |
+
Trained TextVectorization layer.
|
378 |
+
- text_vectors : tf.Tensor
|
379 |
+
A Tensor contains `text_column` training vectors.
|
380 |
+
- otherwise:
|
381 |
+
text_vectors : tf.Tensor
|
382 |
+
A Tensor contains `text_column` testing vectors.
|
383 |
+
|
384 |
+
Raises:
|
385 |
+
-------
|
386 |
+
- AssertionError
|
387 |
+
If train is False and `text_vectorizer` is None.
|
388 |
+
- AssertionError
|
389 |
+
If train is False and `text_vectorizer` is not an instance of TextVectorization.
|
390 |
+
"""
|
391 |
+
|
392 |
+
if train:
|
393 |
+
text_vectorizer = TextVectorization(
|
394 |
+
max_tokens=max_tokens,
|
395 |
+
output_mode=output_mode,
|
396 |
+
output_sequence_length=output_sequence_length,
|
397 |
+
)
|
398 |
+
text_vectorizer.adapt(text_column)
|
399 |
+
text_vectors = text_vectorizer(text_column)
|
400 |
+
return text_vectorizer, text_vectors
|
401 |
+
|
402 |
+
assert (
|
403 |
+
text_vectorizer is not None
|
404 |
+
), "`text_vectorizer` argument must be not None."
|
405 |
+
assert isinstance(
|
406 |
+
text_vectorizer, TextVectorization
|
407 |
+
), "`text_vectorizer` argument must be an instance of TextVectorization to infer vectors."
|
408 |
+
text_vectors = text_vectorizer(text_column)
|
409 |
+
|
410 |
+
return text_vectors
|
411 |
+
|
412 |
+
def convert_text_to_vectors_glove(
|
413 |
+
self,
|
414 |
+
text_column: pd.Series,
|
415 |
+
train: bool = True,
|
416 |
+
glove_tokenizer: Tokenizer = None,
|
417 |
+
vocab_size: int = 1000,
|
418 |
+
oov_token: str = "<OOV>",
|
419 |
+
max_length: int = 50,
|
420 |
+
padding_type: str = "post",
|
421 |
+
truncation_type: str = "post",
|
422 |
+
) -> Tuple[Tokenizer, np.ndarray] | np.ndarray:
|
423 |
+
"""
|
424 |
+
Converting `text_column` to vectors using `glove_tokenizer`.
|
425 |
+
|
426 |
+
Parameters:
|
427 |
+
------------
|
428 |
+
- text_column : pd.Series
|
429 |
+
Contains the case facts.
|
430 |
+
- train : bool, optional
|
431 |
+
Defines whether the model will be trained or not. (if True,
|
432 |
+
Tokenizer will be trained | else, Tokenizer will used the
|
433 |
+
passed `glove_tokenizer`). (Default is True).
|
434 |
+
- glove_tokenizer : Tokenizer, optional
|
435 |
+
Trained Tokenizer layer will be used for generating embeddings of
|
436 |
+
`text_column` if `train` is False. (Default is None).
|
437 |
+
- vocab_size : int, optional
|
438 |
+
Represents the number of supported vocabulary of the Tokenizer,
|
439 |
+
any token not in this vocabulary will be treated as an out-of-vocabulary
|
440 |
+
token(OOV). (Default is 1000).
|
441 |
+
- oov_tokens : str, optional
|
442 |
+
Represents the token of an out-of-vocabulary token (Default is "<OOV>").
|
443 |
+
- max_length : int, optional
|
444 |
+
Defins the output vector's dimension. (Default is 50).
|
445 |
+
- padding_type : str, optional
|
446 |
+
Defines the padding type of the vectors, if the vector size is less than
|
447 |
+
`max_length`, the rest of the `max_length` will be padded with 0 (Default is "post").
|
448 |
+
- truncation_type : str, optional
|
449 |
+
Defines the truncation type of the vectors, if the vector size is more than
|
450 |
+
`max_length`, the extra of the `max_length` will be truncated (Default is "post").
|
451 |
+
|
452 |
+
Returns:
|
453 |
+
--------
|
454 |
+
- if `train` == True:
|
455 |
+
A tuple contains the following:
|
456 |
+
- glove_tokenizer : Tokenizer
|
457 |
+
Trained Tokenizer layer.
|
458 |
+
- text_padded : np.ndarray
|
459 |
+
An array contains `text_column` vectors.
|
460 |
+
- otherwise:
|
461 |
+
text_padded : np.ndarray
|
462 |
+
An array contains `text_column` vectors.
|
463 |
+
|
464 |
+
Raises:
|
465 |
+
-------
|
466 |
+
- AssertionError
|
467 |
+
If train is False and `glove_tokenizer` is None.
|
468 |
+
- AssertionError
|
469 |
+
If train is False and `glove_tokenizer` is not instance of Tokenizer.
|
470 |
+
"""
|
471 |
+
|
472 |
+
if train:
|
473 |
+
glove_tokenizer = Tokenizer(
|
474 |
+
num_words=vocab_size, oov_token=oov_token)
|
475 |
+
glove_tokenizer.fit_on_texts(text_column)
|
476 |
+
text_sequences = glove_tokenizer.texts_to_sequences(text_column)
|
477 |
+
text_padded = pad_sequences(
|
478 |
+
text_sequences,
|
479 |
+
maxlen=max_length,
|
480 |
+
padding=padding_type,
|
481 |
+
truncating=truncation_type,
|
482 |
+
)
|
483 |
+
|
484 |
+
            return glove_tokenizer, text_padded

        assert (
            glove_tokenizer is not None
        ), "`glove_tokenizer` argument must not be None."
        assert isinstance(
            glove_tokenizer, Tokenizer
        ), "`glove_tokenizer` argument must be an instance of Tokenizer."
        text_sequences = glove_tokenizer.texts_to_sequences(text_column)
        text_padded = pad_sequences(
            text_sequences,
            maxlen=max_length,
            padding=padding_type,
            truncating=truncation_type,
        )

        return text_padded

    def balance_data(self, X_train: pd.Series, y_train: pd.Series) -> pd.DataFrame:
        """
        Balance `X_train` and `y_train` so that the targets in `y_train` are
        distributed equally.

        Parameters:
        ------------
        - X_train : pd.Series
            Contains the case facts.
        - y_train : pd.Series
            Contains the training targets.

        Returns:
        --------
        - shuffled_balanced_df : pd.DataFrame
            The new balanced dataframe with shuffled indices.
        """

        df = pd.concat([X_train, y_train], axis=1)

        first_party = df[df["winner_index"] == 0]
        second_party = df[df["winner_index"] == 1]

        # upsample the second-party class to the size of the first-party class
        # (`resample` is sklearn's `sklearn.utils.resample`)
        upsample_second_party = resample(
            second_party, replace=True, n_samples=len(first_party), random_state=42
        )

        upsample_df = pd.concat([upsample_second_party, first_party])

        shuffled_indices = np.arange(upsample_df.shape[0])
        np.random.shuffle(shuffled_indices)

        shuffled_balanced_df = upsample_df.iloc[shuffled_indices, :]

        return shuffled_balanced_df

    def anonymize_data(
        self,
        first_party_names: pd.Series,
        second_party_names: pd.Series,
        text_column: pd.Series,
    ) -> pd.Series:
        """
        Anonymize `text_column` by replacing `first_party_names` and
        `second_party_names` with a "_PARTY_" tag.

        Parameters:
        ------------
        - first_party_names : pd.Series
            Contains all first party names to be anonymized.
        - second_party_names : pd.Series
            Contains all second party names to be anonymized.
        - text_column : pd.Series
            Contains all texts to be anonymized.

        Returns:
        --------
        - all_anonymized_facts : pd.Series
            The anonymized version of `text_column`.
        """

        all_anonymized_facts = []

        for i in range(text_column.shape[0]):
            facts = text_column.iloc[i]
            first_party_name = first_party_names.iloc[i]
            second_party_name = second_party_names.iloc[i]
            anonymized_facts = self._anonymize_case_facts(
                first_party_name, second_party_name, facts
            )
            all_anonymized_facts.append(anonymized_facts)

        return pd.Series(all_anonymized_facts)

    def preprocess_data(self, text_column: pd.Series) -> pd.Series:
        """
        Preprocess and clean all texts in `text_column`.

        Parameters:
        ------------
        - text_column : pd.Series
            Contains all case facts.

        Returns:
        --------
        - preprocessed_text : pd.Series
            Contains all texts after being processed.
        """

        preprocessed_text = text_column.apply(self._preprocess_text)
        return preprocessed_text
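A minimal usage sketch of the deployment preprocessing flow above, assuming a `Preprocessor` instance and the train split are in scope; the column names `first_party`, `second_party`, and `facts` are assumptions based on the docstrings, while `winner_index` comes from `balance_data` itself:

    # illustrative only: anonymize, then clean, then balance the training split
    anonymized_facts = preprocessor.anonymize_data(
        X_train["first_party"], X_train["second_party"], X_train["facts"]
    )
    cleaned_facts = preprocessor.preprocess_data(anonymized_facts)
    balanced_df = preprocessor.balance_data(cleaned_facts, y_train["winner_index"])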
style.css
ADDED
@@ -0,0 +1,94 @@
@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@300;400;500;600;700;800&display=swap');

* {
    font-family: 'Cairo', sans-serif !important;
}

/* title */
.e16nr0p30 {
    font-weight: 700;
    font-size: 30px;
}

/* buttons */
.edgvbvh10,
.edgvbvh5 {
    width: 100%;
    height: 40px;
    background-color: #4756ff;
    color: #fff;
    transition: 0.4s;
    border: none;
}

.edgvbvh10:hover,
.edgvbvh5:hover {
    background-color: #3747fd;
    color: #fff;
    border: none;
}

.edgvbvh10:focus,
.edgvbvh5:focus {
    background-color: #3747fd;
    color: #fff !important;
    box-shadow: none;
    border: none;
}

/* header */
.row_heading {
    font-size: 14px;
}

/* spinner */
.css-1y04v0k.e17lx80j1,
.css-p6380s.e17lx80j1 {
    margin: 0px;
    border-color: #34e27f #b3b3b333 #cacaca33 !important;
    -webkit-box-flex: 0;
    flex-grow: 0;
    flex-shrink: 0;
}

/* inputs styling */
.st-bf {
    transition: 0.8s;
    border: none !important;
}

.st-bf:hover {
    box-shadow: 0 0 0 4px #dbdbdb !important;
}

/* text stylings */
.highlight-petitioner {
    border-radius: 0.4rem;
    background-color: rgba(253, 231, 142, 0.4);
    color: #ffd061;
    padding: 1px 5px;
    margin-top: 10px;
    margin-right: 5px;
}

.highlight-respondent {
    border-radius: 0.4rem;
    background-color: rgba(78, 170, 255, 0.2);
    color: #6195ff;
    padding: 1px 5px;
    margin-top: 10px;
    margin-right: 5px;
}

.bold-text {
    font-weight: 700 !important;
}

.text-facts {
    line-height: 40px;
}

/* footer */
footer {
    display: none !important;
}
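The selectors above target Streamlit's auto-generated class names (e.g. `.edgvbvh10`, `.css-1y04v0k`), which are specific to a particular Streamlit version. As a sketch (not part of this upload), a stylesheet like this is typically injected into the app as follows:

    import streamlit as st

    # read style.css and inject it into the rendered page
    with open("style.css") as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)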
utils.py
ADDED
@@ -0,0 +1,389 @@
from typing import Callable, List, Tuple

import numpy as np
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer


def read_data(filepath="../csvs/"):
    """
    Read the CSV files of the dataset.

    Parameters:
    ----------
    - filepath : str
        Defines the path that contains the CSV files.

    Returns:
    --------
    A tuple containing the following:
    - X_train : pd.DataFrame
    - X_test : pd.DataFrame
    - y_train : pd.DataFrame
    - y_test : pd.DataFrame
    """

    # each CSV carries its old index as the first column, so drop it
    X_train = pd.read_csv(filepath + "X_train.csv")
    X_train = X_train.iloc[:, 1:]

    X_test = pd.read_csv(filepath + "X_test.csv")
    X_test = X_test.iloc[:, 1:]

    y_train = pd.read_csv(filepath + "y_train.csv")
    y_train = y_train.iloc[:, 1:]

    y_test = pd.read_csv(filepath + "y_test.csv")
    y_test = y_test.iloc[:, 1:]

    return X_train, X_test, y_train, y_test

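# Example usage (a sketch; assumes the four CSVs exist at the default path):
#
#     X_train, X_test, y_train, y_test = read_data("../csvs/")
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
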
def train_model(
    model_building_func: Callable[[], keras.models.Sequential],
    X_train_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
    y_train: pd.Series,
    k: int = 4,
    num_epochs: int = 30,
    batch_size: int = 64,
) -> Tuple[
    List[keras.models.Sequential],
    List[List[float]],
    List[List[float]],
    List[List[float]],
    List[List[float]],
]:
    """
    Trains a model on `X_train_vectors` and `y_train` using k-fold cross-validation.

    Parameters:
    -----------
    - model_building_func : Callable[[], keras.models.Sequential]
        A function that builds and compiles a Keras Sequential model.
    - X_train_vectors : pd.DataFrame | np.ndarray | tf.Tensor
        The training input data.
    - y_train : pd.Series
        The training target data.
    - k : int, optional
        The number of folds for cross-validation (default is 4).
    - num_epochs : int, optional
        The number of epochs to train for (default is 30).
    - batch_size : int, optional
        The batch size to use during training (default is 64).

    Returns:
    --------
    A tuple containing the following items:
    - all_models : List[keras.models.Sequential]
        A list of `k` trained models.
    - all_losses : List[List[float]]
        A `k` by `num_epochs` list containing the training losses for each fold.
    - all_val_losses : List[List[float]]
        A `k` by `num_epochs` list containing the validation losses for each fold.
    - all_accuracies : List[List[float]]
        A `k` by `num_epochs` list containing the training accuracies for each fold.
    - all_val_accuracies : List[List[float]]
        A `k` by `num_epochs` list containing the validation accuracies for each fold.
    """

    num_validation_samples = len(X_train_vectors) // k

    all_models = []
    all_losses = []
    all_val_losses = []
    all_accuracies = []
    all_val_accuracies = []

    for fold in range(k):
        print(f"fold: {fold+1}")
        # the current fold is held out for validation
        validation_data = X_train_vectors[
            num_validation_samples * fold : num_validation_samples * (fold + 1)
        ]
        validation_targets = y_train[
            num_validation_samples * fold : num_validation_samples * (fold + 1)
        ]

        # the remaining folds are concatenated into the training set
        training_data = np.concatenate(
            [
                X_train_vectors[: num_validation_samples * fold],
                X_train_vectors[num_validation_samples * (fold + 1) :],
            ]
        )
        training_targets = np.concatenate(
            [
                y_train[: num_validation_samples * fold],
                y_train[num_validation_samples * (fold + 1) :],
            ]
        )

        # build a fresh model for every fold so weights are not reused
        model = model_building_func()
        history = model.fit(
            training_data,
            training_targets,
            validation_data=(validation_data, validation_targets),
            epochs=num_epochs,
            batch_size=batch_size,
        )

        all_models.append(model)
        all_losses.append(history.history["loss"])
        all_val_losses.append(history.history["val_loss"])
        all_accuracies.append(history.history["accuracy"])
        all_val_accuracies.append(history.history["val_accuracy"])

    return (all_models, all_losses, all_val_losses, all_accuracies, all_val_accuracies)

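# Usage sketch for `train_model` (the builder below is a hypothetical example,
# not something defined in this repo):
#
#     def build_model():
#         model = keras.Sequential([
#             keras.layers.Dense(64, activation="relu"),
#             keras.layers.Dense(1, activation="sigmoid"),
#         ])
#         model.compile(optimizer="adam", loss="binary_crossentropy",
#                       metrics=["accuracy"])
#         return model
#
#     models, losses, val_losses, accs, val_accs = train_model(
#         build_model, X_train_vectors, y_train, k=4, num_epochs=30
#     )
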
def print_testing_loss_accuracy(
    all_models: List[keras.models.Sequential],
    X_test_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
    y_test: pd.Series,
) -> None:
    """
    Display the testing loss and testing accuracy of each model in `all_models`,
    along with their averages.

    Parameters:
    ------------
    - all_models : List[keras.models.Sequential]
        A list of `k` trained models.
    - X_test_vectors : pd.DataFrame | np.ndarray | tf.Tensor
        Contains the testing vectors.
    - y_test : pd.Series
        Contains the testing labels.
    """

    sum_testing_losses = 0.0
    sum_testing_accuracies = 0.0

    for i, model in enumerate(all_models):
        print(f"model: {i+1}")
        # `evaluate` returns [loss, accuracy] for a model compiled with an accuracy metric
        loss_accuracy = model.evaluate(X_test_vectors, y_test, verbose=1)
        sum_testing_losses += loss_accuracy[0]
        sum_testing_accuracies += loss_accuracy[1]
        print("====" * 20)

    num_models = len(all_models)
    avg_testing_loss = sum_testing_losses / num_models
    avg_testing_acc = sum_testing_accuracies / num_models
    print(f"average testing loss: {avg_testing_loss:.3f}")
    print(f"average testing accuracy: {avg_testing_acc:.3f}")

+
def calculate_average_measures(
|
178 |
+
all_losses: list[list[float]],
|
179 |
+
all_val_losses: list[list[float]],
|
180 |
+
all_accuracies: list[list[float]],
|
181 |
+
all_val_accuracies: list[list[float]],
|
182 |
+
) -> Tuple[
|
183 |
+
List[keras.models.Sequential],
|
184 |
+
List[List[float]],
|
185 |
+
List[List[float]],
|
186 |
+
List[List[float]],
|
187 |
+
List[List[float]],
|
188 |
+
]:
|
189 |
+
"""
|
190 |
+
Calculate the average measures of cross-validated results.
|
191 |
+
|
192 |
+
Parameters:
|
193 |
+
------------
|
194 |
+
- all_losses : List[List[float]]
|
195 |
+
A `k` by `num_epochs` list contains the values of training losses.
|
196 |
+
- all_val_losses : List[List[float]]
|
197 |
+
A `k` by `num_epochs` list contains the values of validation losses.
|
198 |
+
- all_accuracies : List[List[float]]
|
199 |
+
A `k` by `num_epochs` list contains the values of training accuracies.
|
200 |
+
- all_val_accuracies : List[List[float]]
|
201 |
+
A `k` by `num_epochs` list contains the values of validation accuracies.
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
--------
|
205 |
+
A tuple containing the following items:
|
206 |
+
- avg_loss_hist : List[float]
|
207 |
+
A list of length `num_epochs` contains the average of training losses.
|
208 |
+
- avg_val_loss_hist : List[float]
|
209 |
+
A list of length `num_epochs` contains the average of validaton losses.
|
210 |
+
- avg_acc_hist : List[float]
|
211 |
+
A list of length `num_epochs` contains the average of training accuracies.
|
212 |
+
- avg_val_acc_hist : List[float]
|
213 |
+
A list of length `num_epochs` contains the average of validation accuracies.
|
214 |
+
"""
|
215 |
+
|
216 |
+
num_epochs = len(all_losses[0])
|
217 |
+
avg_loss_hist = [np.mean([x[i] for x in all_losses]) for i in range(num_epochs)]
|
218 |
+
avg_val_loss_hist = [
|
219 |
+
np.mean([x[i] for x in all_val_losses]) for i in range(num_epochs)
|
220 |
+
]
|
221 |
+
avg_acc_hist = [np.mean([x[i] for x in all_accuracies]) for i in range(num_epochs)]
|
222 |
+
avg_val_acc_hist = [
|
223 |
+
np.mean([x[i] for x in all_val_accuracies]) for i in range(num_epochs)
|
224 |
+
]
|
225 |
+
|
226 |
+
return (avg_loss_hist, avg_val_loss_hist, avg_acc_hist, avg_val_acc_hist)
|
227 |
+
|
228 |
+
|
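# Sketch of the full cross-validation loop using the helpers above
# (variable names are illustrative):
#
#     models, losses, val_losses, accs, val_accs = train_model(
#         build_model, X_train_vectors, y_train
#     )
#     print_testing_loss_accuracy(models, X_test_vectors, y_test)
#     avg_loss, avg_val_loss, avg_acc, avg_val_acc = calculate_average_measures(
#         losses, val_losses, accs, val_accs
#     )
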
class Doc2VecModel:
    """Responsible for creating, initializing, and training a Doc2Vec embeddings model."""

    def __init__(self, vector_size=50, min_count=2, epochs=100, dm=1, window=5) -> None:
        """
        Initialize a Doc2Vec model.

        Parameters:
        ------------
        - vector_size : int, optional
            Dimensionality of the feature vectors (default is 50).
        - min_count : int, optional
            Ignores all words with a total frequency lower than this (default is 2).
        - epochs : int, optional
            The number of training epochs (default is 100).
        - dm : int, optional
            Defines the training algorithm. If `dm=1`, distributed memory (PV-DM)
            is used; otherwise, distributed bag of words (PV-DBOW) is employed
            (default is 1).
        - window : int, optional
            The maximum distance between the current and predicted word within a
            sentence (default is 5).
        """

        self.doc2vec_model = Doc2Vec(
            vector_size=vector_size,
            min_count=min_count,
            epochs=epochs,
            dm=dm,
            seed=865,
            window=window,
        )

    def train_doc2vec_embeddings_model(
        self, tagged_docs_train: List[TaggedDocument]
    ) -> Doc2Vec:
        """
        Train the Doc2Vec model on `tagged_docs_train`.

        Parameters:
        ------------
        - tagged_docs_train : List[TaggedDocument]
            Contains the training documents in the format Doc2Vec requires.

        Returns:
        --------
        - doc2vec_model : Doc2Vec
            The trained Doc2Vec model.
        """

        self.doc2vec_model.build_vocab(tagged_docs_train)
        self.doc2vec_model.train(
            tagged_docs_train,
            total_examples=self.doc2vec_model.corpus_count,
            epochs=self.doc2vec_model.epochs,
        )

        return self.doc2vec_model

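# Usage sketch for `Doc2VecModel` (the whitespace tokenization and `train_facts`
# iterable are illustrative; the project may tokenize differently):
#
#     tagged_docs = [
#         TaggedDocument(words=facts.split(), tags=[i])
#         for i, facts in enumerate(train_facts)
#     ]
#     d2v = Doc2VecModel(vector_size=50, epochs=100)
#     model = d2v.train_doc2vec_embeddings_model(tagged_docs)
#     vector = model.infer_vector("unseen case facts".split())
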
class GloveModel:
    """Responsible for creating and generating the GloVe embedding layer."""

    def __init__(self) -> None:
        pass

    def _generate_glove_embedding_index(
        self, glove_file_path: str = "GloVe/glove.6B.50d.txt"
    ) -> dict:
        """
        Generate the GloVe embedding index.

        Parameters:
        ------------
        - glove_file_path : str
            Defines the path of the pretrained GloVe embeddings text file
            (default is "GloVe/glove.6B.50d.txt").

        Returns:
        --------
        - embeddings_index : dict
            Contains each word as a key and its coefficients as the value.
        """

        # each line of the GloVe file is: <word> <coef_1> ... <coef_n>
        embeddings_index = {}
        with open(glove_file_path, encoding="utf8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype="float32")
                embeddings_index[word] = coefs

        return embeddings_index

    def _generate_glove_embedding_matrix(
        self, word_index: dict, embedding_index: dict, max_length: int
    ) -> np.ndarray:
        """
        Generate the embedding matrix for each word in `word_index`.

        Parameters:
        -----------
        - word_index : dict
            Contains words as keys with their indices as values.
        - embedding_index : dict
            Contains each word as a key and its coefficients as the value.
        - max_length : int
            Defines the size of the embedding vector of each word in the
            embedding matrix (it must match the dimensionality of the
            pretrained GloVe vectors).

        Returns:
        --------
        - embedding_matrix : np.ndarray
            Contains the embedding vector of each word in `word_index`.
        """

        embedding_matrix = np.zeros((len(word_index) + 1, max_length))

        # words absent from the pretrained index keep an all-zeros vector
        for word, i in word_index.items():
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        return embedding_matrix

    def generate_glove_embedding_layer(
        self, glove_tokenizer: Tokenizer, max_length: int = 50
    ) -> keras.layers.Embedding:
        """
        Create a GloVe embedding layer for later usage in the neural network.

        Parameters:
        ----------
        - glove_tokenizer : Tokenizer
            A tokenizer trained on the training data, used to extract the word index.
        - max_length : int, optional
            Defines the length of the output embedding vector for each word
            (default is 50).

        Returns:
        --------
        - embedding_layer : keras.layers.Embedding
            An embedding layer of size `len(word_index) + 1` by `max_length` with
            pretrained, frozen weights that can be used as a vectorizer of case facts.
        """

        word_index = glove_tokenizer.word_index

        embedding_index = self._generate_glove_embedding_index()
        embedding_matrix = self._generate_glove_embedding_matrix(
            word_index, embedding_index, max_length
        )

        # note: `max_length` serves as both the embedding dimensionality and the
        # input sequence length here, so it must equal the GloVe vector size (50)
        embedding_layer = keras.layers.Embedding(
            len(word_index) + 1,
            max_length,
            weights=[embedding_matrix],
            input_length=max_length,
            trainable=False,
        )

        return embedding_layer
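# Usage sketch for `GloveModel` (assumes the GloVe file at the default path and
# a `train_facts` iterable of case-fact strings; the `oov_token` choice is an
# assumption, not something this module prescribes):
#
#     glove_tokenizer = Tokenizer(oov_token="<OOV>")
#     glove_tokenizer.fit_on_texts(train_facts)
#     embedding_layer = GloveModel().generate_glove_embedding_layer(
#         glove_tokenizer, max_length=50
#     )
#     model = keras.Sequential([embedding_layer])  # downstream layers omitted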