{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# reading dataset files\n",
    "muslim_file = \"../data/right/Haqiqatjou.csv\"\n",
    "muslim_df = pd.read_csv(muslim_file)\n",
    "\n",
    "\n",
    "sanghi_file = \"../data/wrong/MrSinha_.csv\"\n",
    "sanghi_df = pd.read_csv(sanghi_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset formatting\n",
    "muslim_df['category'] = 'MUSLIM'\n",
    "\n",
    "sanghi_df['category'] = 'SANGHI'\n",
    "\n",
    "\n",
    "frames = [muslim_df, sanghi_df]\n",
    "final_dataset = pd.concat(frames)\n",
    "final_dataset = final_dataset.drop(['index'], axis=1)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ref : https://stackoverflow.com/questions/43777243/how-to-split-a-dataframe-in-pandas-in-predefined-percentages\n",
    "\n",
    "# 70% train dataset\n",
    "# train_frames = [muslim_df.sample(frac=0.7), sanghi_df.sample(frac=0.7).reset_index(drop=True)]\n",
    "train_frames = [muslim_df, sanghi_df]\n",
    "train_dataset = pd.concat(train_frames)\n",
    "train_dataset['content'] = train_dataset['content'].astype(str)\n",
    "\n",
    "\n",
    "# 30% test dataset\n",
    "remaining_frames = [muslim_df.drop(muslim_df.sample(frac=0.7).index), sanghi_df.drop(sanghi_df.sample(frac=0.7).index).reset_index(drop=True)]\n",
    "test_dataset = pd.concat(remaining_frames)\n",
    "# test_dataset['content'] = train_dataset['content'].astype(str)\n",
    "# test_dataset\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-03-22 14:25:54.140036: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-03-22 14:25:58.302374: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
     ]
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import re\n",
    "import shutil\n",
    "import string\n",
    "import tensorflow as tf\n",
    "from keras import layers\n",
    "from keras import losses\n",
    "import keras\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenization\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(train_dataset['content'])\n",
    "\n",
    "vocab_size = len(tokenizer.word_index) + 1\n",
    "max_length = 100  # adjust as needed\n",
    "embedding_dim = 50  # adjust as needed\n",
    "\n",
    "X = tokenizer.texts_to_sequences(train_dataset['content'])\n",
    "X = pad_sequences(X, maxlen=max_length, padding='post')\n",
    "\n",
    "# Encode labels\n",
    "labels = train_dataset['category'].map({'MUSLIM': '1', 'SANGHI': 0}).astype('float32').values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/xd/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/keras/src/layers/core/embedding.py:81: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
      "  super().__init__(**kwargs)\n",
      "2024-03-21 20:20:29.968292: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.617785: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.618176: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.620147: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.620655: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.620962: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.824877: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.825245: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.825539: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
      "2024-03-21 20:20:33.841867: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1754 MB memory:  -> device: 0, name: NVIDIA GeForce 920MX, pci bus id: 0000:01:00.0, compute capability: 5.0\n"
     ]
    }
   ],
   "source": [
    "# build the model\n",
    "model = keras.Sequential([\n",
    "    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
    "    keras.layers.GlobalAveragePooling1D(),\n",
    "    keras.layers.Dense(16, activation='relu'),\n",
    "    keras.layers.Dense(1, activation='sigmoid')\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compile the model\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split into train and test datasets\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/10\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "I0000 00:00:1711032641.198169   31002 service.cc:145] XLA service 0x7fa54c006910 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
      "I0000 00:00:1711032641.198244   31002 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce 920MX, Compute Capability 5.0\n",
      "2024-03-21 20:20:41.877803: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
      "2024-03-21 20:20:42.791103: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m34/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - accuracy: 0.5261 - loss: 0.6929"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0000 00:00:1711032645.847800   31002 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 71ms/step - accuracy: 0.5337 - loss: 0.6922 - val_accuracy: 0.5514 - val_loss: 0.6814\n",
      "Epoch 2/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.5740 - loss: 0.6781 - val_accuracy: 0.5563 - val_loss: 0.6636\n",
      "Epoch 3/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.6603 - loss: 0.6492 - val_accuracy: 0.7781 - val_loss: 0.6244\n",
      "Epoch 4/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.8210 - loss: 0.5806 - val_accuracy: 0.8328 - val_loss: 0.5476\n",
      "Epoch 5/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 5ms/step - accuracy: 0.8506 - loss: 0.4952 - val_accuracy: 0.7653 - val_loss: 0.4860\n",
      "Epoch 6/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.8238 - loss: 0.4224 - val_accuracy: 0.8842 - val_loss: 0.4122\n",
      "Epoch 7/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.9288 - loss: 0.3257 - val_accuracy: 0.8408 - val_loss: 0.3694\n",
      "Epoch 8/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.9280 - loss: 0.2705 - val_accuracy: 0.9357 - val_loss: 0.3032\n",
      "Epoch 9/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.9178 - loss: 0.2446 - val_accuracy: 0.9357 - val_loss: 0.2772\n",
      "Epoch 10/10\n",
      "\u001b[1m46/46\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.9724 - loss: 0.1830 - val_accuracy: 0.8826 - val_loss: 0.2798\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.src.callbacks.history.History at 0x7fa5504b6080>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train the model\n",
    "model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m20/20\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 4ms/step - accuracy: 0.8722 - loss: 0.2829\n",
      "Test Accuracy: 0.8826366662979126\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the model\n",
    "loss, accuracy = model.evaluate(X_test, y_test)\n",
    "print(\"Test Accuracy:\", accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the model\n",
    "try:\n",
    "  model.save(\"../models/muslim-sanghi-binary-classification.keras\")\n",
    "except FileNotFoundError:\n",
    "  os.mkdir(\"../models\")\n",
    "  model.save(\"../models/muslim-sanghi-binary-classification.keras\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 250ms/step\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3806811"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use model\n",
    "def preprocess_text(text):\n",
    "  # Tokenize the text\n",
    "  tokenized_text = tokenizer.texts_to_sequences([text])\n",
    "  # Pad sequences to the same length as training data\n",
    "  padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')\n",
    "  return padded_text\n",
    "\n",
    "# load model\n",
    "model = keras.models.load_model(\"../models/muslim-sanghi-binary-classification.keras\")  # Replace \"your_model.h5\" with the path to your trained model\n",
    "\n",
    "# Preprocess the custom input text\n",
    "preprocessed_text = preprocess_text(\"'Kanwariyas are slaves, they're doing slavery' A team member of Pappu shamelessly says it & Pappu happily agrees\")\n",
    "\n",
    "# Make predictions\n",
    "predictions = model.predict(preprocessed_text)\n",
    "\n",
    "predictions[0][0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}