|
from typing import Callable, List, Tuple |
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
from gensim.models.doc2vec import Doc2Vec, TaggedDocument |
|
|
|
import tensorflow as tf |
|
from tensorflow import keras |
|
from keras.preprocessing.text import Tokenizer |
|
|
|
|
|
def read_data(filepath="../csvs/"):
    """
    Read the train/test CSV files of the dataset.

    Each CSV is expected to carry its original index in the first
    column (as written by ``DataFrame.to_csv``), which is dropped
    after loading.

    Parameters:
    ----------
    - filepath : str
        Path prefix of the directory containing the CSV files.
        Must end with a path separator, e.g. "../csvs/".

    Returns:
    --------
    A tuple contains the following:
    - X_train : pd.DataFrame
    - X_test : pd.DataFrame
    - y_train : pd.DataFrame
    - y_test : pd.DataFrame
    """

    def _load(filename):
        # Read one CSV and drop the leading index column.
        frame = pd.read_csv(filepath + filename)
        return frame.iloc[:, 1:]

    X_train = _load("X_train.csv")
    X_test = _load("X_test.csv")
    y_train = _load("y_train.csv")
    y_test = _load("y_test.csv")

    return X_train, X_test, y_train, y_test
|
|
|
|
|
def train_model(
    model_building_func: Callable[[], keras.models.Sequential],
    X_train_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
    y_train: pd.Series,
    k: int = 4,
    num_epochs: int = 30,
    batch_size: int = 64,
) -> Tuple[
    List[keras.models.Sequential],
    List[List[float]],
    List[List[float]],
    List[List[float]],
    List[List[float]],
]:
    """
    Train a model with k-fold cross-validation on `X_train_vectors` / `y_train`.

    For each of the `k` folds a fresh model is built via
    `model_building_func`, trained on the remaining data, and validated
    on the held-out fold.

    Parameters:
    -----------
    - model_building_func : Callable[[], tf.keras.models.Sequential]
        A function that builds and compiles a Keras Sequential model.
    - X_train_vectors : pd.DataFrame
        The training input data.
    - y_train : pd.Series
        The training target data.
    - k : int, optional
        The number of folds for cross-validation (default is 4).
    - num_epochs : int, optional
        The number of epochs to train for (default is 30).
    - batch_size : int, optional
        The batch size to use during training (default is 64).

    Returns:
    --------
    A tuple containing the following items:
    - all_models : List[keras.models.Sequential]
        A list of `k` trained models.
    - all_losses : List[List[float]]
        A `k` by `num_epochs` list of training losses per fold.
    - all_val_losses : List[List[float]]
        A `k` by `num_epochs` list of validation losses per fold.
    - all_accuracies : List[List[float]]
        A `k` by `num_epochs` list of training accuracies per fold.
    - all_val_accuracies : List[List[float]]
        A `k` by `num_epochs` list of validation accuracies per fold.
    """

    # Samples beyond k * fold_size are never used for validation.
    fold_size = len(X_train_vectors) // k

    all_models = []
    all_losses = []
    all_val_losses = []
    all_accuracies = []
    all_val_accuracies = []

    for fold in range(k):
        print(f"fold: {fold+1}")

        # Held-out slice for this fold.
        start, stop = fold_size * fold, fold_size * (fold + 1)
        val_inputs = X_train_vectors[start:stop]
        val_labels = y_train[start:stop]

        # Everything outside [start, stop) becomes training data.
        train_inputs = np.concatenate(
            [X_train_vectors[:start], X_train_vectors[stop:]]
        )
        train_labels = np.concatenate([y_train[:start], y_train[stop:]])

        model = model_building_func()
        history = model.fit(
            train_inputs,
            train_labels,
            validation_data=(val_inputs, val_labels),
            epochs=num_epochs,
            batch_size=batch_size,
        )

        record = history.history
        all_models.append(model)
        all_losses.append(record["loss"])
        all_val_losses.append(record["val_loss"])
        all_accuracies.append(record["accuracy"])
        all_val_accuracies.append(record["val_accuracy"])

    return (all_models, all_losses, all_val_losses, all_accuracies, all_val_accuracies)
|
|
|
|
|
def print_testing_loss_accuracy(
    all_models: List[keras.models.Sequential],
    X_test_vectors: pd.DataFrame | np.ndarray | tf.Tensor,
    y_test: pd.Series,
) -> None:
    """
    Display the testing loss and accuracy of every model in `all_models`,
    followed by their averages across all models.

    Parameters:
    ------------
    - all_models : List[keras.models.Sequential]
        A list of size `k` contains trained models.
    - X_test_vectors : pd.DataFrame
        Contains testing vectors.
    - y_test : pd.Series
        Contains testing labels.
    """

    total_loss = 0.0
    total_accuracy = 0.0

    for idx, model in enumerate(all_models):
        print(f"model: {idx+1}")
        # evaluate() reports [loss, accuracy] for each compiled metric.
        scores = model.evaluate(X_test_vectors, y_test, verbose=1)
        total_loss += scores[0]
        total_accuracy += scores[1]
        print("====" * 20)

    count = len(all_models)
    print(f"average testing loss: {total_loss / count:.3f}")
    print(f"average testing accuracy: {total_accuracy / count:.3f}")
|
|
|
|
|
def calculate_average_measures(
    all_losses: list[list[float]],
    all_val_losses: list[list[float]],
    all_accuracies: list[list[float]],
    all_val_accuracies: list[list[float]],
) -> Tuple[
    List[float],
    List[float],
    List[float],
    List[float],
]:
    """
    Calculate the average measures of cross-validated results.

    Parameters:
    ------------
    - all_losses : List[List[float]]
        A `k` by `num_epochs` list contains the values of training losses.
    - all_val_losses : List[List[float]]
        A `k` by `num_epochs` list contains the values of validation losses.
    - all_accuracies : List[List[float]]
        A `k` by `num_epochs` list contains the values of training accuracies.
    - all_val_accuracies : List[List[float]]
        A `k` by `num_epochs` list contains the values of validation accuracies.

    Returns:
    --------
    A tuple containing the following items:
    - avg_loss_hist : List[float]
        A list of length `num_epochs` contains the average of training losses.
    - avg_val_loss_hist : List[float]
        A list of length `num_epochs` contains the average of validation losses.
    - avg_acc_hist : List[float]
        A list of length `num_epochs` contains the average of training accuracies.
    - avg_val_acc_hist : List[float]
        A list of length `num_epochs` contains the average of validation accuracies.
    """

    def _average_per_epoch(history: list[list[float]]) -> List[float]:
        # Mean across the k folds for each epoch position.
        num_epochs = len(history[0])
        return [np.mean([fold[i] for fold in history]) for i in range(num_epochs)]

    return (
        _average_per_epoch(all_losses),
        _average_per_epoch(all_val_losses),
        _average_per_epoch(all_accuracies),
        _average_per_epoch(all_val_accuracies),
    )
|
|
|
|
|
class Doc2VecModel:
    """Builds, initializes, and trains a gensim Doc2Vec embeddings model."""

    def __init__(self, vector_size=50, min_count=2, epochs=100, dm=1, window=5) -> None:
        """
        Initialize the underlying Doc2Vec model.

        Parameters:
        ------------
        - vector_size : int, optional
            Dimensionality of the feature vectors (Default is 50).
        - min_count : int, optional
            Ignores all words with total frequency lower than this (Default is 2).
        - epochs : int, optional
            Represents the number of training epochs (Default is 100).
        - dm : int, optional
            Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM)
            is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed
            (Default is 1).
        - window : int, optional
            The maximum distance between the current and predicted word within a
            sentence (Default is 5).
        """

        settings = dict(
            vector_size=vector_size,
            min_count=min_count,
            epochs=epochs,
            dm=dm,
            seed=865,  # fixed seed so embedding initialization is reproducible
            window=window,
        )
        self.doc2vec_model = Doc2Vec(**settings)

    def train_doc2vec_embeddings_model(
        self, tagged_docs_train: List[TaggedDocument]
    ) -> Doc2Vec:
        """
        Build the vocabulary and train the Doc2Vec model on `tagged_docs_train`.

        Parameters:
        ------------
        - tagged_docs_train : list[TaggedDocument]
            The training corpus in the tagged-document format Doc2Vec requires.

        Returns:
        --------
        - doc2vec_model : Doc2Vec
            The trained Doc2Vec model.
        """

        model = self.doc2vec_model
        model.build_vocab(tagged_docs_train)
        model.train(
            tagged_docs_train,
            total_examples=model.corpus_count,
            epochs=model.epochs,
        )

        return model
|
|
|
|
|
class GloveModel:
    """Responsible for creating and generating the GloVe embedding layer."""

    def __init__(self) -> None:
        pass

    def _generate_glove_embedding_index(
        self, glove_file_path: str = "GloVe/glove.6B.50d.txt"
    ) -> dict:
        """
        Parse the pretrained GloVe text file into a word-to-vector lookup.

        Parameters:
        ------------
        - glove_file_path : str
            Defines the path of the pretrained GloVe embeddings text file
            (Default is "GloVe/glove.6B.50d.txt").

        Returns:
        --------
        - embedding_index : dict
            Maps each word to its coefficient vector (float32 ndarray).
        """

        embedding_index = {}
        with open(glove_file_path, encoding="utf8") as handle:
            for row in handle:
                # Each line is: <word> <coef_1> <coef_2> ... <coef_d>
                token, *coefficients = row.split()
                embedding_index[token] = np.asarray(coefficients, dtype="float32")

        return embedding_index

    def _generate_glove_embedding_matrix(
        self, word_index: dict, embedding_index: dict, max_length: int
    ) -> np.ndarray:
        """
        Build the embedding matrix for every word in `word_index`.

        Parameters:
        -----------
        - word_index : dict
            Maps words to their integer indices.
        - embedding_index : dict
            Maps each word to its coefficient vector.
        - max_length : int
            Size of the embedding vector of each word in the matrix.

        Returns:
        --------
        - embedding_matrix : np.ndarray
            Contains an embedding vector for each word in `word_index`.
        """

        # len(word_index) + 1 rows so every index in word_index maps to a row;
        # words missing from embedding_index keep all-zero vectors.
        embedding_matrix = np.zeros((len(word_index) + 1, max_length))
        for token, position in word_index.items():
            vector = embedding_index.get(token)
            if vector is not None:
                embedding_matrix[position] = vector

        return embedding_matrix

    def generate_glove_embedding_layer(
        self, glove_tokenizer: Tokenizer, max_length: int = 50
    ) -> keras.layers.Embedding:
        """
        Create a frozen GloVe embedding layer for use in the neural network.

        Parameters:
        ----------
        - glove_tokenizer : Tokenizer
            Tokenizer trained on the training data; its word index is used.
        - max_length : int, optional
            Maximum length of the output embedding vector for each word
            (Default is 50).

        Returns:
        --------
        - embedding_layer : keras.layers.Embedding
            An embedding layer of size `word index + 1` by `max_length`
            with pretrained, non-trainable weights.
        """

        word_index = glove_tokenizer.word_index
        embedding_matrix = self._generate_glove_embedding_matrix(
            word_index, self._generate_glove_embedding_index(), max_length
        )

        # trainable=False keeps the pretrained GloVe weights frozen.
        return keras.layers.Embedding(
            len(word_index) + 1,
            max_length,
            weights=[embedding_matrix],
            input_length=max_length,
            trainable=False,
        )
|
|