{ "cells": [ { "cell_type": "markdown", "id": "060994f2", "metadata": {}, "source": [ "# importing the necessary libraries" ] }, { "cell_type": "code", "execution_count": null, "id": "033ebd27", "metadata": {}, "outputs": [], "source": [ "# imports - native Python\n", "import collections\n", "import csv\n", "import os\n", "import re\n", "# imports - 3rd party\n", "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n", "# installs from 🤗\n", "! pip install transformers\n", "! pip install datasets\n", "from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "from datasets import Dataset, DatasetDict" ] }, { "cell_type": "code", "execution_count": null, "id": "0214c70f", "metadata": {}, "outputs": [], "source": [ "import torch\n", "torch.cuda.empty_cache()" ] }, { "cell_type": "markdown", "id": "13732b06", "metadata": {}, "source": [ "# Loading the data" ] }, { "cell_type": "code", "execution_count": null, "id": "e5a782b3", "metadata": {}, "outputs": [], "source": [ "# Using csv instead of pandas for sanity and to do filtering while loading\n", "\n", "# make parallel lists of texts and labels\n", "# texts: strings containing messages\n", "dataset_dict = {'text':[], 'label':[]}\n", "for f in os.listdir():\n", " # use all .tsv files that have been loaded\n", " if f.endswith('dontpatronizeme.tsv'):\n", " with open(f) as tsv_file:\n", " reader = csv.DictReader(tsv_file, dialect='excel-tab')\n", " for line in reader:\n", " text = line['text']\n", " # a few of the Message fields are empty, so we should skip those ones\n", " if text!=None and text.strip()!=\"\":\n", " dataset_dict['text'].append(text)\n", " dataset_dict['label'].append(int(line['label']))\n", "# huggingface function to convert from dict to their Dataset object\n", "# which will work nicely with their model trainer\n", "ds = Dataset.from_dict(dataset_dict)" ] }, { "cell_type": "markdown", "id": "52379811", "metadata": {}, "source": [ "# Creating train, valid, test splits" ] }, { "cell_type": "code", "execution_count": null, "id": "a6f69bc1", "metadata": {}, "outputs": [], "source": [ "# no function to split into train/validation/test so we do 2 separate splits\n", "# first split 80-20 into train and test+validation\n", "train_testvalid = ds.train_test_split(test_size=0.2)\n", "# then split the 20 into 10-10 validation and test\n", "test_valid = train_testvalid['test'].train_test_split(test_size=0.5)\n", "# finally, make the full dataset the 80-10-10 split as a DatasetDict object\n", "train_test_valid_dataset = DatasetDict({\n", " 'train': train_testvalid['train'],\n", " 'test': test_valid['test'],\n", " 'valid': test_valid['train']})\n", "# quick check (if this doesn't pass, will get an error in the tokenization)\n", "# makes sure we filtered the data correcly at the beginning and removed None\n", "for split in train_test_valid_dataset.keys():\n", " assert not any([x==None for x in train_test_valid_dataset[split]['text']])" ] }, { "cell_type": "markdown", "id": "0dfcc029", "metadata": {}, "source": [ "# Tokenizer" ] }, { "cell_type": "markdown", "id": "b2cb0082", "metadata": {}, "source": [ "This is the tokenizer for the distilbert model" ] }, { "cell_type": "code", "execution_count": null, "id": "65a26dc2", "metadata": {}, "outputs": [], "source": [ "# just use the default tokenizer for the model\n", "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n", "\n", "# simple wrapper\n", "def tokenize(examples, 
textfield=\"text\"):\n", " return tokenizer(examples[textfield], padding=\"max_length\", truncation=True)\n", "\n", "# batch tokenization\n", "tokenized_datasets = train_test_valid_dataset.map(tokenize, batched=True)" ] }, { "cell_type": "markdown", "id": "38a15ebb", "metadata": {}, "source": [ "Below are the examples for also the RoBERTa model and the BERT model" ] }, { "cell_type": "code", "execution_count": null, "id": "8f45cf1d", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", "\n", "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\")" ] }, { "cell_type": "code", "execution_count": null, "id": "79d33a06", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n", "\n", "model = AutoModelForMaskedLM.from_pretrained(\"roberta-base\")" ] }, { "cell_type": "markdown", "id": "9b550e83", "metadata": {}, "source": [ "# Model " ] }, { "cell_type": "code", "execution_count": null, "id": "12c960c0", "metadata": {}, "outputs": [], "source": [ "# Setup collation\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n", "\n", "# Load model\n", "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)" ] }, { "cell_type": "markdown", "id": "d4342956", "metadata": {}, "source": [ "# Computing the metrics and training args" ] }, { "cell_type": "code", "execution_count": null, "id": "4c974458", "metadata": {}, "outputs": [], "source": [ "# using sklearn to compute precision, recall, f1, and accuracy\n", "def compute_metrics(pred):\n", " labels = pred.label_ids\n", " preds = pred.predictions.argmax(-1)\n", " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')\n", " acc = accuracy_score(labels, preds)\n", " return {\n", " 'accuracy': acc,\n", " 'f1': f1,\n", " 'precision': precision,\n", " 'recall': recall\n", " }" ] }, { "cell_type": "code", "execution_count": null, "id": "8c4fb414", "metadata": {}, "outputs": [], "source": [ "# Set training args (just using defaults from the following tutorial for now:\n", "# https://huggingface.co/docs/transformers/training )\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\",\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=16,\n", " per_device_eval_batch_size=16,\n", " num_train_epochs=5,\n", " weight_decay=0.01,\n", ")\n", "\n", "# setup the trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"valid\"],\n", " tokenizer=tokenizer,\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", ")" ] }, { "cell_type": "markdown", "id": "cb346507", "metadata": {}, "source": [ "# Train model and Evaluate" ] }, { "cell_type": "code", "execution_count": null, "id": "de170b1e", "metadata": {}, "outputs": [], "source": [ "# train the model\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "id": "48adbaed", "metadata": {}, "outputs": [], "source": [ "# evaluate on the test set\n", "# should only do for _best_ model of each type \n", "# after selecting hyperparameters that work best on validation set\n", "trainer.evaluate(tokenized_datasets[\"test\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "c3dea644", 
"metadata": {}, "outputs": [], "source": [ "##!pip install huggingface_hub\n", "#!sudo apt-get install fit-lfs\n", "#!huggingface-cli login\n", "#!git clone https://huggingface.co/achyut/patronizing_detection\n", "#cd /content/patronizing_detection" ] }, { "cell_type": "markdown", "id": "539c8683", "metadata": {}, "source": [ "# LIME for Deep Learning Models" ] }, { "cell_type": "code", "execution_count": null, "id": "9f7c2cab", "metadata": {}, "outputs": [], "source": [ "# LIME importing all the necessary libraries\n", "import numpy as np\n", "import lime\n", "import torch\n", "import torch.nn.functional as F\n", "from lime.lime_text import LimeTextExplainer\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification" ] }, { "cell_type": "code", "execution_count": null, "id": "d53f4b7d", "metadata": {}, "outputs": [], "source": [ "# Set the class names\n", "class_names = ['non-patronizing','patronizing']" ] }, { "cell_type": "markdown", "id": "2d91f290", "metadata": {}, "source": [ "For LIME and other interpretable AI models, we Have to use the tokenizer and the model of the fine-tuned pretrained model. Not the Huggingface un fine tuned model. That is because we want to use the model with the trained weights, tokens and vocab" ] }, { "cell_type": "code", "execution_count": null, "id": "e2381d7b", "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(\"achyut/patronizing_detection\")\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(\"achyut/patronizing_detection\")" ] }, { "cell_type": "code", "execution_count": null, "id": "318859d6", "metadata": {}, "outputs": [], "source": [ "model.cuda()" ] }, { "cell_type": "code", "execution_count": null, "id": "99a7e69f", "metadata": {}, "outputs": [], "source": [ "!pip install more_itertools\n" ] }, { "cell_type": "markdown", "id": "c810588c", "metadata": {}, "source": [ "# The function that calculates the logits for each sequence. 
" ] }, { "cell_type": "code", "execution_count": null, "id": "c3db6441", "metadata": {}, "outputs": [], "source": [ "import more_itertools\n", "def predictor4(texts, batch_size=64):\n", " probas = []\n", " for chunk in more_itertools.chunked(texts, batch_size):\n", " tokenized = tokenizer(chunk, return_tensors=\"pt\", padding=True)\n", " outputs = model(tokenized['input_ids'].to('cuda'), tokenized['attention_mask'].to('cuda'))\n", " probas.append(F.softmax(outputs.logits).cpu().detach().numpy())\n", " return np.vstack(probas)" ] }, { "cell_type": "code", "execution_count": null, "id": "1074572d", "metadata": {}, "outputs": [], "source": [ "predictor4([\"I have two dogs\",\"The keep barking\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "661d8281", "metadata": {}, "outputs": [], "source": [ "explainer = LimeTextExplainer(class_names=class_names)" ] }, { "cell_type": "code", "execution_count": null, "id": "abb9b201", "metadata": {}, "outputs": [], "source": [ "str_to_predict = ds[6]['text']\n", "exp = explainer.explain_instance(str_to_predict, predictor4, num_features= 25, num_samples = 2000)\n", "exp.show_in_notebook(text=str_to_predict)" ] }, { "cell_type": "code", "execution_count": null, "id": "1885619b", "metadata": {}, "outputs": [], "source": [ "exp.as_list()" ] }, { "cell_type": "code", "execution_count": null, "id": "5f004287", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "42dfbb84", "metadata": {}, "source": [ "# classical Machine Learning" ] }, { "cell_type": "code", "execution_count": null, "id": "94835013", "metadata": {}, "outputs": [], "source": [ "import collections\n", "import csv\n", "import os\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "from nltk.tokenize import word_tokenize\n", "from sklearn.preprocessing import LabelEncoder\n", "from collections import defaultdict\n", "from nltk.corpus import wordnet as wn\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import model_selection, naive_bayes, svm\n", "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n", "from nltk import pos_tag\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer" ] }, { "cell_type": "code", "execution_count": null, "id": "8605ed57", "metadata": {}, "outputs": [], "source": [ "# We can use a seed if we want reproducibility\n", "#np.random.seed(500)" ] }, { "cell_type": "code", "execution_count": null, "id": "5475808d", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": null, "id": "c3745eee", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('averaged_perceptron_tagger')" ] }, { "cell_type": "code", "execution_count": null, "id": "180f42bf", "metadata": {}, "outputs": [], "source": [ "Corpus = pd.read_csv(\"patro_downsampled.csv\", names = ['text','label'])\n", "# change it to str, lower case and drop the na values\n", "Corpus.text = Corpus.text.astype(str)\n", "Corpus['text'] = Corpus['text'].str.lower()\n", "Corpus = Corpus.dropna()\n", "Corpus.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "5f9d00c8", "metadata": {}, "outputs": [], "source": [ "Corpus.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "659d463e", "metadata": {}, "outputs": [], "source": [ "#tokenizing our para text column here\n", "Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)\n", "\n", "# Tagging to 
{ "cell_type": "markdown", "id": "42dfbb84", "metadata": {}, "source": [ "# Classical Machine Learning" ] },
{ "cell_type": "code", "execution_count": null, "id": "94835013", "metadata": {}, "outputs": [], "source": [ "import collections\n", "import csv\n", "import os\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "from nltk.tokenize import word_tokenize\n", "from sklearn.preprocessing import LabelEncoder\n", "from collections import defaultdict\n", "from nltk.corpus import wordnet as wn\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import model_selection, naive_bayes, svm\n", "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n", "from nltk import pos_tag\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer" ] },
{ "cell_type": "code", "execution_count": null, "id": "8605ed57", "metadata": {}, "outputs": [], "source": [ "# We can use a seed if we want reproducibility\n", "#np.random.seed(500)" ] },
{ "cell_type": "code", "execution_count": null, "id": "5475808d", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')\n", "# punkt and stopwords are also needed, by word_tokenize and the stop-word filter below\n", "nltk.download('punkt')\n", "nltk.download('stopwords')" ] },
{ "cell_type": "code", "execution_count": null, "id": "c3745eee", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('averaged_perceptron_tagger')" ] },
{ "cell_type": "code", "execution_count": null, "id": "180f42bf", "metadata": {}, "outputs": [], "source": [ "Corpus = pd.read_csv(\"patro_downsampled.csv\", names = ['text','label'])\n", "# change it to str, lower case and drop the na values\n", "Corpus.text = Corpus.text.astype(str)\n", "Corpus['text'] = Corpus['text'].str.lower()\n", "Corpus = Corpus.dropna()\n", "Corpus.head()" ] },
{ "cell_type": "code", "execution_count": null, "id": "5f9d00c8", "metadata": {}, "outputs": [], "source": [ "Corpus.info()" ] },
{ "cell_type": "code", "execution_count": null, "id": "659d463e", "metadata": {}, "outputs": [], "source": [ "# tokenizing the text column here\n", "Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)\n", "\n", "# tagging to understand whether each word is a noun, verb, adverb, etc.\n", "tag_map = defaultdict(lambda : wn.NOUN)\n", "tag_map['J'] = wn.ADJ\n", "tag_map['V'] = wn.VERB\n", "tag_map['R'] = wn.ADV" ] },
{ "cell_type": "code", "execution_count": null, "id": "5af9ea94", "metadata": {}, "outputs": [], "source": [ "# instantiate the lemmatizer once, outside the loop\n", "word_Lemmatized = WordNetLemmatizer()\n", "for index,entry in enumerate(Corpus['text']):\n", " # empty list which we will append to the df at the end\n", " Final_words = []\n", " for word, tag in pos_tag(entry):\n", " # check for stop words and consider only alphabetic tokens\n", " if word not in stopwords.words('english') and word.isalpha():\n", " word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n", " Final_words.append(word_Final)\n", " # the final processed set of words for each row is stored in 'text_final'\n", " Corpus.loc[index,'text_final'] = str(Final_words)" ] },
{ "cell_type": "code", "execution_count": null, "id": "8c6d9bc6", "metadata": {}, "outputs": [], "source": [ "Corpus.head()" ] },
{ "cell_type": "code", "execution_count": null, "id": "f654c4ab", "metadata": {}, "outputs": [], "source": [ "# Train, test split\n", "Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],\n", " Corpus['label'],\n", " test_size=0.2)" ] },
{ "cell_type": "code", "execution_count": null, "id": "00747dbd", "metadata": {}, "outputs": [], "source": [ "# Encoding our labels\n", "Encoder = LabelEncoder()\n", "Train_Y = Encoder.fit_transform(Train_Y)\n", "# use transform (not fit_transform) on the test labels so the mapping matches\n", "Test_Y = Encoder.transform(Test_Y)\n", "\n", "# Vectorizer\n", "Tfidf_vect = TfidfVectorizer()\n", "\n", "Tfidf_vect.fit(Corpus['text_final'])" ] },
{ "cell_type": "code", "execution_count": null, "id": "95b89126", "metadata": {}, "outputs": [], "source": [ "# Transforming the train and test inputs into vectors\n", "Train_X_Tfidf = Tfidf_vect.transform(Train_X)\n", "Test_X_Tfidf = Tfidf_vect.transform(Test_X)\n", "print(len(Tfidf_vect.vocabulary_))" ] },
{ "cell_type": "markdown", "id": "1da1f215", "metadata": {}, "source": [ "# Fitting Models" ] },
{ "cell_type": "markdown", "id": "b8d618cd", "metadata": {}, "source": [ "## Naive Bayes" ] },
{ "cell_type": "code", "execution_count": null, "id": "7613821b", "metadata": {}, "outputs": [], "source": [ "# fit the NB classifier\n", "Naive = naive_bayes.MultinomialNB()\n", "naive_model = Naive.fit(Train_X_Tfidf,Train_Y)\n", "predictions_NB = Naive.predict(Test_X_Tfidf)\n", "print(\"Naive Bayes Accuracy Score -> \",accuracy_score(Test_Y, predictions_NB)*100)" ] },
{ "cell_type": "code", "execution_count": null, "id": "d04b0813", "metadata": {}, "outputs": [], "source": [ "# sklearn metrics expect (y_true, y_pred) in that order\n", "print(f1_score(Test_Y, predictions_NB), precision_score(Test_Y, predictions_NB), recall_score(Test_Y, predictions_NB))" ] },
{ "cell_type": "markdown", "id": "539cb258", "metadata": {}, "source": [ "## SVM" ] },
{ "cell_type": "code", "execution_count": null, "id": "cf9ebed3", "metadata": {}, "outputs": [], "source": [ "# SVM classifier\n", "SVM = svm.SVC(C=2.0, kernel='poly', degree=2, gamma='scale')\n", "svm_model = SVM.fit(Train_X_Tfidf,Train_Y)\n", "predictions_SVM = SVM.predict(Test_X_Tfidf)\n", "print(\"SVM Accuracy Score -> \",accuracy_score(Test_Y, predictions_SVM)*100)" ] },
{ "cell_type": "code", "execution_count": null, "id": "1fbf3e41", "metadata": {}, "outputs": [], "source": [ "print(f1_score(Test_Y, predictions_SVM), precision_score(Test_Y, predictions_SVM), recall_score(Test_Y, predictions_SVM))" ] },
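{ "cell_type": "markdown", "id": "ef56ab78", "metadata": {}, "source": [ "A sketch (not in the original notebook): `classification_report` prints the same per-class precision/recall/F1 in one call, which is handy when comparing the classifiers below." ] },
{ "cell_type": "code", "execution_count": null, "id": "fa67bc89", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import classification_report\n", "# sketch: per-class precision/recall/F1 for the SVM in one call\n", "print(classification_report(Test_Y, predictions_SVM))" ] },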
"metadata": {}, "outputs": [], "source": [ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 5 , scoring = 'f1_macro')\n", "scores" ] }, { "cell_type": "code", "execution_count": null, "id": "c5a07117", "metadata": {}, "outputs": [], "source": [ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 10 , scoring = 'f1_macro')\n", "scores" ] }, { "cell_type": "markdown", "id": "a4dea60f", "metadata": {}, "source": [ "## Logistic Regression" ] }, { "cell_type": "code", "execution_count": null, "id": "7c96b88d", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "logisticReg = LogisticRegression()\n", "logisticReg.fit(Train_X_Tfidf,Train_Y)\n", "predictions_LR = logisticReg.predict(Test_X_Tfidf)\n", "print(\"LR Accuracy Score -> \",accuracy_score(predictions_LR, Test_Y)*100)" ] }, { "cell_type": "code", "execution_count": null, "id": "47750ca0", "metadata": {}, "outputs": [], "source": [ "print(f1_score(predictions_LR, Test_Y), precision_score(predictions_LR, Test_Y),recall_score(predictions_LR, Test_Y))" ] }, { "cell_type": "markdown", "id": "75efc6b3", "metadata": {}, "source": [ "## RandomForest" ] }, { "cell_type": "code", "execution_count": null, "id": "144104e6", "metadata": {}, "outputs": [], "source": [ "# Apply random forest on the data\n", "from sklearn.ensemble import RandomForestClassifier\n", "randomForest = RandomForestClassifier(n_estimators = 50) \n", "randomForest.fit(Train_X_Tfidf,Train_Y)\n", "predictions_RF = logisticReg.predict(Test_X_Tfidf)\n", "print(\"LR Accuracy Score -> \",accuracy_score(predictions_RF, Test_Y)*100)" ] }, { "cell_type": "code", "execution_count": null, "id": "1f083f5e", "metadata": {}, "outputs": [], "source": [ "print(f1_score(predictions_RF, Test_Y),precision_score(predictions_RF, Test_Y),recall_score(predictions_RF, Test_Y))" ] }, { "cell_type": "markdown", "id": "03fb7cc8", "metadata": {}, "source": [ "# LIME for classical ML" ] }, { "cell_type": "code", "execution_count": null, "id": "41fa18be", "metadata": {}, "outputs": [], "source": [ "import lime\n", "import sklearn.ensemble\n", "from __future__ import print_function\n", "from lime import lime_text\n", "from sklearn.pipeline import make_pipeline\n", "from lime.lime_text import LimeTextExplainer" ] }, { "cell_type": "markdown", "id": "d952eb5d", "metadata": {}, "source": [ "## Make the pipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "f96a244e", "metadata": {}, "outputs": [], "source": [ "c = make_pipeline(Tfidf_vect, logisticred_model)\n", "ls_X_test= list(Test_X)\n", "class_names = {0: 'patro', 1:'non-patro'}\n", "LIME_explainer = LimeTextExplainer(class_names=class_names)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c0a727a1", "metadata": {}, "outputs": [], "source": [ "idx = 15\n", "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)" ] }, { "cell_type": "code", "execution_count": null, "id": "b1755fc8", "metadata": {}, "outputs": [], "source": [ "print('Document id: %d' % idx)\n", "print('Text: ', ls_X_test[idx])\n", "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n", "print('True class: %s' % class_names.get(list(Test_Y)[idx]))" ] }, { "cell_type": "code", "execution_count": null, "id": "78b0d22e", "metadata": {}, "outputs": [], "source": [ "print(\"1 = non-Patro class, 0 = Patro class\")\n", "# show the explainability results with highlighted text\n", "LIME_exp.show_in_notebook(text=True)" ] }, { "cell_type": "code", "execution_count": null, 
"id": "e3e16b80", "metadata": {}, "outputs": [], "source": [ "idx = 45\n", "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)\n", "print('Document id: %d' % idx)\n", "print('Text: ', ls_X_test[idx])\n", "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n", "print('True class: %s' % class_names.get(list(Test_Y)[idx]))" ] }, { "cell_type": "code", "execution_count": null, "id": "bd8e838a", "metadata": {}, "outputs": [], "source": [ "print(\"1 = non-Patro class, 0 = Patro class\")\n", "# show the explainability results with highlighted text\n", "LIME_exp.show_in_notebook(text=True)" ] }, { "cell_type": "markdown", "id": "f8f07e74", "metadata": {}, "source": [ "# Topic Modeling" ] }, { "cell_type": "code", "execution_count": null, "id": "2825b328", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np \n", "import re\n", "from wordcloud import WordCloud\n", "import gensim\n", "from gensim.utils import simple_preprocess\n", "from nltk.corpus import stopwords\n", "import gensim.corpora as corpora\n", "from pprint import pprint\n", "import pyLDAvis.gensim_models\n", "import pickle\n", "import pyLDAvis" ] }, { "cell_type": "code", "execution_count": null, "id": "71ab6908", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"dontpatronizeme.csv\", names = ['Message','label'])" ] }, { "cell_type": "code", "execution_count": null, "id": "0c4a0602", "metadata": {}, "outputs": [], "source": [ "df[\"Message_processed\"] = df[\"Message\"].map(lambda x: re.sub('[,\\.!?]', '', str(x)))\n", "df['Message_processed'] = df['Message_processed'].map(lambda x: x.lower())\n", "df['Message_processed'].head()" ] }, { "cell_type": "code", "execution_count": null, "id": "0e507f49", "metadata": {}, "outputs": [], "source": [ "long_string = ','.join(list(df['Message_processed'].values))# Create a WordCloud object\n", "wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=3, contour_color='steelblue')# Generate a word cloud\n", "wordcloud.generate(long_string)# Visualize the word cloud\n", "wordcloud.to_image()" ] }, { "cell_type": "code", "execution_count": null, "id": "76a3f280", "metadata": {}, "outputs": [], "source": [ "stop_words = stopwords.words('english')\n", "stop_words.extend(['from', 'subject', 're', 'edu', 'use'])\n", "def sent_to_words(sentences):\n", " for sentence in sentences:\n", " # deacc=True removes punctuations\n", " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n", " \n", "def remove_stopwords(texts):\n", " return [[word for word in simple_preprocess(str(doc)) \n", " if word not in stop_words] for doc in texts]\n", "data = df.Message_processed.values.tolist()\n", "data_words = list(sent_to_words(data))# remove stop words\n", "data_words = remove_stopwords(data_words)" ] }, { "cell_type": "code", "execution_count": null, "id": "1e257cc3", "metadata": {}, "outputs": [], "source": [ "print(data_words[:1][0][:30])" ] }, { "cell_type": "code", "execution_count": null, "id": "98c5203f", "metadata": {}, "outputs": [], "source": [ "id2word = corpora.Dictionary(data_words)\n", "texts = data_words# Term Document Frequency\n", "corpus = [id2word.doc2bow(text) for text in texts]# View\n", "print(corpus[:1][0][:30])" ] }, { "cell_type": "code", "execution_count": null, "id": "b4a35025", "metadata": {}, "outputs": [], "source": [ "num_topics = 10# Build LDA model\n", "lda_model = gensim.models.LdaMulticore(corpus=corpus,\n", " id2word=id2word,\n", " num_topics=num_topics)\n", 
"# Print the Keyword in the 10 topics\n", "pprint(lda_model.print_topics())\n", "doc_lda = lda_model[corpus]" ] }, { "cell_type": "code", "execution_count": null, "id": "00346a62", "metadata": {}, "outputs": [], "source": [ "pyLDAvis.enable_notebook()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f6f7889b", "metadata": {}, "outputs": [], "source": [ "vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds=\"mmds\", R=30)\n", "vis\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e4b7ca16", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1b214796", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e7f8e54c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "021b015f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ab1a9490", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0da95a15", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "22c069c0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c02c30f3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9cdde3ad", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "717270ef", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "25a8f105", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }