{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluating the best TF-IDF and Doc2Vec models\n",
    "\n",
    "Loads the saved models, scores each one on the test split, then averages their outputs (soft voting) and reports accuracy, a confusion matrix and a classification report for the combined predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sn\n",
    "\n",
    "from tensorflow import keras\n",
    "from gensim.models.doc2vec import Doc2Vec\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from src.preprocessing import Preprocessor\n",
    "from src.utils import read_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saved artifacts: the Doc2Vec embedding model and the two saved Keras models.\n",
    "# Paths are relative to the kernel's working directory.\n",
    "doc2vec_model_embeddings = Doc2Vec.load(\n",
    "    \"models/best_doc2vec_embeddings\")\n",
    "doc2vec_model = keras.models.load_model(\n",
    "    \"models/best_doc2vec_model.h5\")\n",
    "tfidf_model = keras.models.load_model(\"models/best_tfidf_model.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train/test features and labels as returned by the project helper.\n",
    "X_train, X_test, y_train, y_test = read_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor = Preprocessor()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Best combination of TF-IDF model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training columns needed to anonymize the facts.\n",
    "first_party_names_3 = X_train[\"first_party\"]\n",
    "second_party_names_3 = X_train[\"second_party\"]\n",
    "facts_3 = X_train[\"Facts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same columns for the test split.\n",
    "test_first_party_names_3 = X_test[\"first_party\"]\n",
    "test_second_party_names_3 = X_test[\"second_party\"]\n",
    "test_facts_3 = X_test[\"Facts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): presumably replaces the party names inside the facts -\n",
    "# confirm against src.preprocessing.Preprocessor.anonymize_data.\n",
    "anonymized_facts_3 = preprocessor.anonymize_data(\n",
    "    first_party_names_3, second_party_names_3, facts_3)\n",
    "test_anonymized_facts_3 = preprocessor.anonymize_data(\n",
    "    test_first_party_names_3, test_second_party_names_3, test_facts_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training mode returns both the vectorizer and the training vectors.\n",
    "text_vectorizer_3, X_train_vectors_3 = preprocessor.convert_text_to_vectors_tf_idf(\n",
    "    anonymized_facts_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the vectorizer obtained from the training facts for the test facts.\n",
    "X_test_vectors_3 = preprocessor.convert_text_to_vectors_tf_idf(\n",
    "    test_anonymized_facts_3, train=False, text_vectorizer=text_vectorizer_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22/22 [==============================] - 1s 9ms/step\n"
     ]
    }
   ],
   "source": [
    "y_preds_tfidf = tfidf_model.predict(X_test_vectors_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TF-IDF accuracy: 0.973\n"
     ]
    }
   ],
   "source": [
    "# Threshold the model outputs at 0.5 to obtain hard 0/1 labels.\n",
    "tfidf_preds = np.where(y_preds_tfidf > 0.5, 1, 0)\n",
    "tfidf_accuracy = accuracy_score(y_test, tfidf_preds)\n",
    "print(\"TF-IDF accuracy: {:.3f}\".format(tfidf_accuracy))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Best combination of Doc2Vec model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this path runs preprocess_data on the raw facts, while the\n",
    "# TF-IDF path uses the anonymized facts - confirm this matches how the\n",
    "# Doc2Vec model was trained.\n",
    "X_test_processed = preprocessor.preprocess_data(X_test[\"Facts\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vectorize the test facts with the loaded Doc2Vec embeddings (train=False).\n",
    "X_test_vectors_doc2vec = preprocessor.convert_text_to_vectors_doc2vec(\n",
    "    X_test_processed, train=False, embeddings_doc2vec=doc2vec_model_embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22/22 [==============================] - 0s 2ms/step\n"
     ]
    }
   ],
   "source": [
    "y_preds_doc2vec = doc2vec_model.predict(X_test_vectors_doc2vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc2Vec accuracy: 0.944\n"
     ]
    }
   ],
   "source": [
    "# Same 0.5 threshold as for the TF-IDF model.\n",
    "doc2vec_preds = np.where(y_preds_doc2vec > 0.5, 1, 0)\n",
    "doc2vec_accuracy = accuracy_score(y_test, doc2vec_preds)\n",
    "print(\"Doc2Vec accuracy: {:.3f}\".format(doc2vec_accuracy))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combining both models' predictions (soft voting)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Voting accuracy: 0.986\n"
     ]
    }
   ],
   "source": [
    "# Average the two models' outputs, then threshold the mean at 0.5.\n",
    "# Scores and labels get separate names so re-running a cell never sees a\n",
    "# variable that changed meaning.\n",
    "voting_scores = (y_preds_doc2vec + y_preds_tfidf) / 2\n",
    "voting_predictions = np.where(voting_scores > 0.5, 1, 0)\n",
    "voting_accuracy = accuracy_score(y_test, voting_predictions)\n",
    "print(\"Voting accuracy: {:.3f}\".format(voting_accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       ""
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "",
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cf = confusion_matrix(y_test, voting_predictions)\n",
    "# fmt=\"d\" prints the integer counts; seaborn's default \".2g\" format would\n",
    "# render a count such as 423 as 4.2e+02.\n",
    "sn.heatmap(cf, annot=True, fmt=\"d\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " precision recall f1-score support\n",
      "\n",
      " 0 0.98 1.00 0.99 423\n",
      " 1 1.00 0.96 0.98 270\n",
      "\n",
      " accuracy 0.99 693\n",
      " macro avg 0.99 0.98 0.98 693\n",
      "weighted avg 0.99 0.99 0.99 693\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Per-class precision/recall/F1 for the combined predictions.\n",
    "cls_report = classification_report(y_test, voting_predictions)\n",
    "print(cls_report)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}