achyut committed on
Commit
0103f05
1 Parent(s): 1ae9e8b

Upload NotebookPCL.ipynb

Files changed (1)
  1. NotebookPCL.ipynb +1163 -0
NotebookPCL.ipynb ADDED
@@ -0,0 +1,1163 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "060994f2",
6
+ "metadata": {},
7
+ "source": [
8
+ "# importing the necessary libraries"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "033ebd27",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# imports - native Python\n",
19
+ "import collections\n",
20
+ "import csv\n",
21
+ "import os\n",
22
+ "import re\n",
23
+ "# imports - 3rd party\n",
24
+ "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
25
+ "# installs from 🤗\n",
26
+ "! pip install transformers\n",
27
+ "! pip install datasets\n",
28
+ "from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
29
+ "from datasets import Dataset, DatasetDict"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "0214c70f",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import torch\n",
40
+ "torch.cuda.empty_cache()"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "id": "13732b06",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Loading the data"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "e5a782b3",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "# Using csv instead of pandas for sanity and to do filtering while loading\n",
59
+ "\n",
60
+ "# make parallel lists of texts and labels\n",
61
+ "# texts: strings containing messages\n",
62
+ "dataset_dict = {'text':[], 'label':[]}\n",
63
+ "for f in os.listdir():\n",
64
+ " # use all .tsv files that have been loaded\n",
65
+ " if f.endswith('dontpatronizeme.tsv'):\n",
66
+ " with open(f) as tsv_file:\n",
67
+ " reader = csv.DictReader(tsv_file, dialect='excel-tab')\n",
68
+ " for line in reader:\n",
69
+ " text = line['text']\n",
70
+ " # a few of the Message fields are empty, so we should skip those ones\n",
71
+ " if text!=None and text.strip()!=\"\":\n",
72
+ " dataset_dict['text'].append(text)\n",
73
+ " dataset_dict['label'].append(int(line['label']))\n",
74
+ "# huggingface function to convert from dict to their Dataset object\n",
75
+ "# which will work nicely with their model trainer\n",
76
+ "ds = Dataset.from_dict(dataset_dict)"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "52379811",
82
+ "metadata": {},
83
+ "source": [
84
+ "# Creating train, valid, test splits"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "id": "a6f69bc1",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "# no function to split into train/validation/test so we do 2 separate splits\n",
95
+ "# first split 80-20 into train and test+validation\n",
96
+ "train_testvalid = ds.train_test_split(test_size=0.2)\n",
97
+ "# then split the 20 into 10-10 validation and test\n",
98
+ "test_valid = train_testvalid['test'].train_test_split(test_size=0.5)\n",
99
+ "# finally, make the full dataset the 80-10-10 split as a DatasetDict object\n",
100
+ "train_test_valid_dataset = DatasetDict({\n",
101
+ " 'train': train_testvalid['train'],\n",
102
+ " 'test': test_valid['test'],\n",
103
+ " 'valid': test_valid['train']})\n",
104
+ "# quick check (if this doesn't pass, will get an error in the tokenization)\n",
105
+ "# makes sure we filtered the data correcly at the beginning and removed None\n",
106
+ "for split in train_test_valid_dataset.keys():\n",
107
+ " assert not any([x==None for x in train_test_valid_dataset[split]['text']])"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "id": "0dfcc029",
113
+ "metadata": {},
114
+ "source": [
115
+ "# Tokenizer"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "markdown",
120
+ "id": "b2cb0082",
121
+ "metadata": {},
122
+ "source": [
123
+ "This is the tokenizer for the distilbert model"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "65a26dc2",
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "# just use the default tokenizer for the model\n",
134
+ "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n",
135
+ "\n",
136
+ "# simple wrapper\n",
137
+ "def tokenize(examples, textfield=\"text\"):\n",
138
+ " return tokenizer(examples[textfield], padding=\"max_length\", truncation=True)\n",
139
+ "\n",
140
+ "# batch tokenization\n",
141
+ "tokenized_datasets = train_test_valid_dataset.map(tokenize, batched=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "id": "38a15ebb",
147
+ "metadata": {},
148
+ "source": [
149
+ "Below are the examples for also the RoBERTa model and the BERT model"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "id": "8f45cf1d",
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
160
+ "\n",
161
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
162
+ "\n",
163
+ "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\")"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "id": "79d33a06",
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
174
+ "\n",
175
+ "tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n",
176
+ "\n",
177
+ "model = AutoModelForMaskedLM.from_pretrained(\"roberta-base\")"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "markdown",
182
+ "id": "9b550e83",
183
+ "metadata": {},
184
+ "source": [
185
+ "# Model "
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": null,
191
+ "id": "12c960c0",
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "# Setup collation\n",
196
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
197
+ "\n",
198
+ "# Load model\n",
199
+ "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "markdown",
204
+ "id": "d4342956",
205
+ "metadata": {},
206
+ "source": [
207
+ "# Computing the metrics and training args"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "id": "4c974458",
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
+ "# using sklearn to compute precision, recall, f1, and accuracy\n",
218
+ "def compute_metrics(pred):\n",
219
+ " labels = pred.label_ids\n",
220
+ " preds = pred.predictions.argmax(-1)\n",
221
+ " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')\n",
222
+ " acc = accuracy_score(labels, preds)\n",
223
+ " return {\n",
224
+ " 'accuracy': acc,\n",
225
+ " 'f1': f1,\n",
226
+ " 'precision': precision,\n",
227
+ " 'recall': recall\n",
228
+ " }"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "id": "8c4fb414",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "# Set training args (just using defaults from the following tutorial for now:\n",
239
+ "# https://huggingface.co/docs/transformers/training )\n",
240
+ "training_args = TrainingArguments(\n",
241
+ " output_dir=\"./results\",\n",
242
+ " learning_rate=2e-5,\n",
243
+ " per_device_train_batch_size=16,\n",
244
+ " per_device_eval_batch_size=16,\n",
245
+ " num_train_epochs=5,\n",
246
+ " weight_decay=0.01,\n",
247
+ ")\n",
248
+ "\n",
249
+ "# setup the trainer\n",
250
+ "trainer = Trainer(\n",
251
+ " model=model,\n",
252
+ " args=training_args,\n",
253
+ " train_dataset=tokenized_datasets[\"train\"],\n",
254
+ " eval_dataset=tokenized_datasets[\"valid\"],\n",
255
+ " tokenizer=tokenizer,\n",
256
+ " data_collator=data_collator,\n",
257
+ " compute_metrics=compute_metrics,\n",
258
+ ")"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "markdown",
263
+ "id": "cb346507",
264
+ "metadata": {},
265
+ "source": [
266
+ "# Train model and Evaluate"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "id": "de170b1e",
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "# train the model\n",
277
+ "trainer.train()"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "id": "48adbaed",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# evaluate on the test set\n",
288
+ "# should only do for _best_ model of each type \n",
289
+ "# after selecting hyperparameters that work best on validation set\n",
290
+ "trainer.evaluate(tokenized_datasets[\"test\"])"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "id": "c3dea644",
297
+ "metadata": {},
298
+ "outputs": [],
299
+ "source": [
300
+ "##!pip install huggingface_hub\n",
301
+ "#!sudo apt-get install fit-lfs\n",
302
+ "#!huggingface-cli login\n",
303
+ "#!git clone https://huggingface.co/achyut/patronizing_detection\n",
304
+ "#cd /content/patronizing_detection"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "markdown",
309
+ "id": "539c8683",
310
+ "metadata": {},
311
+ "source": [
312
+ "# LIME for Deep Learning Models"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": null,
318
+ "id": "9f7c2cab",
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "# LIME importing all the necessary libraries\n",
323
+ "import numpy as np\n",
324
+ "import lime\n",
325
+ "import torch\n",
326
+ "import torch.nn.functional as F\n",
327
+ "from lime.lime_text import LimeTextExplainer\n",
328
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": null,
334
+ "id": "d53f4b7d",
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "# Set the class names\n",
339
+ "class_names = ['non-patronizing','patronizing']"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "markdown",
344
+ "id": "2d91f290",
345
+ "metadata": {},
346
+ "source": [
347
+ "For LIME and other interpretable AI models, we Have to use the tokenizer and the model of the fine-tuned pretrained model. Not the Huggingface un fine tuned model. That is because we want to use the model with the trained weights, tokens and vocab"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "e2381d7b",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "tokenizer = AutoTokenizer.from_pretrained(\"achyut/patronizing_detection\")\n",
358
+ "\n",
359
+ "model = AutoModelForSequenceClassification.from_pretrained(\"achyut/patronizing_detection\")"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": null,
365
+ "id": "318859d6",
366
+ "metadata": {},
367
+ "outputs": [],
368
+ "source": [
369
+ "model.cuda()"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": null,
375
+ "id": "99a7e69f",
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "!pip install more_itertools\n"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "markdown",
384
+ "id": "c810588c",
385
+ "metadata": {},
386
+ "source": [
387
+ "# The function that calculates the logits for each sequence. "
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "c3db6441",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "import more_itertools\n",
398
+ "def predictor4(texts, batch_size=64):\n",
399
+ " probas = []\n",
400
+ " for chunk in more_itertools.chunked(texts, batch_size):\n",
401
+ " tokenized = tokenizer(chunk, return_tensors=\"pt\", padding=True)\n",
402
+ " outputs = model(tokenized['input_ids'].to('cuda'), tokenized['attention_mask'].to('cuda'))\n",
403
+ " probas.append(F.softmax(outputs.logits).cpu().detach().numpy())\n",
404
+ " return np.vstack(probas)"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "id": "1074572d",
411
+ "metadata": {},
412
+ "outputs": [],
413
+ "source": [
414
+ "predictor4([\"I have two dogs\",\"The keep barking\"])"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": null,
420
+ "id": "661d8281",
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": [
424
+ "explainer = LimeTextExplainer(class_names=class_names)"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "id": "abb9b201",
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "str_to_predict = ds[6]['text']\n",
435
+ "exp = explainer.explain_instance(str_to_predict, predictor4, num_features= 25, num_samples = 2000)\n",
436
+ "exp.show_in_notebook(text=str_to_predict)"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": null,
442
+ "id": "1885619b",
443
+ "metadata": {},
444
+ "outputs": [],
445
+ "source": [
446
+ "exp.as_list()"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "id": "5f004287",
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": []
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "id": "42dfbb84",
460
+ "metadata": {},
461
+ "source": [
462
+ "# classical Machine Learning"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": null,
468
+ "id": "94835013",
469
+ "metadata": {},
470
+ "outputs": [],
471
+ "source": [
472
+ "import collections\n",
473
+ "import csv\n",
474
+ "import os\n",
475
+ "import re\n",
476
+ "import pandas as pd\n",
477
+ "import numpy as np\n",
478
+ "from nltk.tokenize import word_tokenize\n",
479
+ "from sklearn.preprocessing import LabelEncoder\n",
480
+ "from collections import defaultdict\n",
481
+ "from nltk.corpus import wordnet as wn\n",
482
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
483
+ "from sklearn import model_selection, naive_bayes, svm\n",
484
+ "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n",
485
+ "from nltk import pos_tag\n",
486
+ "from nltk.corpus import stopwords\n",
487
+ "from nltk.stem import WordNetLemmatizer"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": null,
493
+ "id": "8605ed57",
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": [
497
+ "# We can use a seed if we want reproducibility\n",
498
+ "#np.random.seed(500)"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "execution_count": null,
504
+ "id": "5475808d",
505
+ "metadata": {},
506
+ "outputs": [],
507
+ "source": [
508
+ "import nltk\n",
509
+ "nltk.download('wordnet')"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "id": "c3745eee",
516
+ "metadata": {},
517
+ "outputs": [],
518
+ "source": [
519
+ "import nltk\n",
520
+ "nltk.download('averaged_perceptron_tagger')"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": null,
526
+ "id": "180f42bf",
527
+ "metadata": {},
528
+ "outputs": [],
529
+ "source": [
530
+ "Corpus = pd.read_csv(\"patro_downsampled.csv\", names = ['text','label'])\n",
531
+ "# change it to str, lower case and drop the na values\n",
532
+ "Corpus.text = Corpus.text.astype(str)\n",
533
+ "Corpus['text'] = Corpus['text'].str.lower()\n",
534
+ "Corpus = Corpus.dropna()\n",
535
+ "Corpus.head()"
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "id": "5f9d00c8",
542
+ "metadata": {},
543
+ "outputs": [],
544
+ "source": [
545
+ "Corpus.info()"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": null,
551
+ "id": "659d463e",
552
+ "metadata": {},
553
+ "outputs": [],
554
+ "source": [
555
+ "#tokenizing our para text column here\n",
556
+ "Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)\n",
557
+ "\n",
558
+ "# Tagging to understand if the word is a noun, verb, adverb etc\n",
559
+ "\n",
560
+ "tag_map = defaultdict(lambda : wn.NOUN)\n",
561
+ "tag_map['J'] = wn.ADJ\n",
562
+ "tag_map['V'] = wn.VERB\n",
563
+ "tag_map['R'] = wn.ADV"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": null,
569
+ "id": "5af9ea94",
570
+ "metadata": {},
571
+ "outputs": [],
572
+ "source": [
573
+ "for index,entry in enumerate(Corpus['text']):\n",
574
+ " # empty list which I will append to the df in the end.\n",
575
+ " Final_words = []\n",
576
+ " \n",
577
+ " word_Lemmatized = WordNetLemmatizer()\n",
578
+ " for word, tag in pos_tag(entry):\n",
579
+ " # check for Stop words and consider only alphabets\n",
580
+ " if word not in stopwords.words('english') and word.isalpha():\n",
581
+ " word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n",
582
+ " Final_words.append(word_Final)\n",
583
+ " # The final processed set of words for each iteration will be stored in 'text_final'\n",
584
+ " Corpus.loc[index,'text_final'] = str(Final_words)"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": null,
590
+ "id": "8c6d9bc6",
591
+ "metadata": {},
592
+ "outputs": [],
593
+ "source": [
594
+ "Corpus.head()"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": null,
600
+ "id": "f654c4ab",
601
+ "metadata": {},
602
+ "outputs": [],
603
+ "source": [
604
+ "#Train, test split\n",
605
+ "Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],\n",
606
+ " Corpus['label'],\n",
607
+ " test_size=0.2)"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": null,
613
+ "id": "00747dbd",
614
+ "metadata": {},
615
+ "outputs": [],
616
+ "source": [
617
+ "#Encoding our labels\n",
618
+ "Encoder = LabelEncoder()\n",
619
+ "Train_Y = Encoder.fit_transform(Train_Y)\n",
620
+ "Test_Y = Encoder.fit_transform(Test_Y)\n",
621
+ "\n",
622
+ "# Vectorizer\n",
623
+ "Tfidf_vect = TfidfVectorizer()\n",
624
+ "\n",
625
+ "Tfidf_vect.fit(Corpus['text_final'])"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": null,
631
+ "id": "95b89126",
632
+ "metadata": {},
633
+ "outputs": [],
634
+ "source": [
635
+ "# Transforming the train and test inputs into vectors\n",
636
+ "Train_X_Tfidf = Tfidf_vect.transform(Train_X)\n",
637
+ "Test_X_Tfidf = Tfidf_vect.transform(Test_X)\n",
638
+ "print(len(Tfidf_vect.vocabulary_))"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "markdown",
643
+ "id": "1da1f215",
644
+ "metadata": {},
645
+ "source": [
646
+ "# Fitting Models"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "markdown",
651
+ "id": "b8d618cd",
652
+ "metadata": {},
653
+ "source": [
654
+ "## NaiveBayes"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": null,
660
+ "id": "7613821b",
661
+ "metadata": {},
662
+ "outputs": [],
663
+ "source": [
664
+ "# fit the NB classifier\n",
665
+ "Naive = naive_bayes.MultinomialNB()\n",
666
+ "naive_model = Naive.fit(Train_X_Tfidf,Train_Y)\n",
667
+ "predictions_NB = Naive.predict(Test_X_Tfidf)\n",
668
+ "print(\"Naive Bayes Accuracy Score -> \",accuracy_score(predictions_NB, Test_Y)*100)"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": null,
674
+ "id": "d04b0813",
675
+ "metadata": {},
676
+ "outputs": [],
677
+ "source": [
678
+ "print(f1_score(predictions_NB, Test_Y),precision_score(predictions_NB, Test_Y),recall_score(predictions_NB, Test_Y))"
679
+ ]
680
+ },
681
+ {
682
+ "cell_type": "markdown",
683
+ "id": "539cb258",
684
+ "metadata": {},
685
+ "source": [
686
+ "## SVM"
687
+ ]
688
+ },
689
+ {
690
+ "cell_type": "code",
691
+ "execution_count": null,
692
+ "id": "cf9ebed3",
693
+ "metadata": {},
694
+ "outputs": [],
695
+ "source": [
696
+ "#SVM classifier\n",
697
+ "SVM = svm.SVC(C=2.0, kernel='poly',degree=2, gamma='scale')\n",
698
+ "svm_model = SVM.fit(Train_X_Tfidf,Train_Y)\n",
699
+ "predictions_SVM = SVM.predict(Test_X_Tfidf)\n",
700
+ "print(\"SVM Accuracy Score -> \",accuracy_score(predictions_SVM, Test_Y)*100)"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": null,
706
+ "id": "1fbf3e41",
707
+ "metadata": {},
708
+ "outputs": [],
709
+ "source": [
710
+ "print(f1_score(predictions_SVM, Test_Y),precision_score(predictions_SVM, Test_Y),recall_score(predictions_SVM, Test_Y))"
711
+ ]
712
+ },
713
+ {
714
+ "cell_type": "code",
715
+ "execution_count": null,
716
+ "id": "81cf1425",
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": [
720
+ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 5 , scoring = 'f1_macro')\n",
721
+ "scores"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": null,
727
+ "id": "c5a07117",
728
+ "metadata": {},
729
+ "outputs": [],
730
+ "source": [
731
+ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 10 , scoring = 'f1_macro')\n",
732
+ "scores"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "markdown",
737
+ "id": "a4dea60f",
738
+ "metadata": {},
739
+ "source": [
740
+ "## Logistic Regression"
741
+ ]
742
+ },
743
+ {
744
+ "cell_type": "code",
745
+ "execution_count": null,
746
+ "id": "7c96b88d",
747
+ "metadata": {},
748
+ "outputs": [],
749
+ "source": [
750
+ "from sklearn.linear_model import LogisticRegression\n",
751
+ "logisticReg = LogisticRegression()\n",
752
+ "logisticReg.fit(Train_X_Tfidf,Train_Y)\n",
753
+ "predictions_LR = logisticReg.predict(Test_X_Tfidf)\n",
754
+ "print(\"LR Accuracy Score -> \",accuracy_score(predictions_LR, Test_Y)*100)"
755
+ ]
756
+ },
757
+ {
758
+ "cell_type": "code",
759
+ "execution_count": null,
760
+ "id": "47750ca0",
761
+ "metadata": {},
762
+ "outputs": [],
763
+ "source": [
764
+ "print(f1_score(predictions_LR, Test_Y), precision_score(predictions_LR, Test_Y),recall_score(predictions_LR, Test_Y))"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "markdown",
769
+ "id": "75efc6b3",
770
+ "metadata": {},
771
+ "source": [
772
+ "## RandomForest"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "code",
777
+ "execution_count": null,
778
+ "id": "144104e6",
779
+ "metadata": {},
780
+ "outputs": [],
781
+ "source": [
782
+ "# Apply random forest on the data\n",
783
+ "from sklearn.ensemble import RandomForestClassifier\n",
784
+ "randomForest = RandomForestClassifier(n_estimators = 50) \n",
785
+ "randomForest.fit(Train_X_Tfidf,Train_Y)\n",
786
+ "predictions_RF = logisticReg.predict(Test_X_Tfidf)\n",
787
+ "print(\"LR Accuracy Score -> \",accuracy_score(predictions_RF, Test_Y)*100)"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": null,
793
+ "id": "1f083f5e",
794
+ "metadata": {},
795
+ "outputs": [],
796
+ "source": [
797
+ "print(f1_score(predictions_RF, Test_Y),precision_score(predictions_RF, Test_Y),recall_score(predictions_RF, Test_Y))"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "markdown",
802
+ "id": "03fb7cc8",
803
+ "metadata": {},
804
+ "source": [
805
+ "# LIME for classical ML"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": null,
811
+ "id": "41fa18be",
812
+ "metadata": {},
813
+ "outputs": [],
814
+ "source": [
815
+ "import lime\n",
816
+ "import sklearn.ensemble\n",
817
+ "from __future__ import print_function\n",
818
+ "from lime import lime_text\n",
819
+ "from sklearn.pipeline import make_pipeline\n",
820
+ "from lime.lime_text import LimeTextExplainer"
821
+ ]
822
+ },
823
+ {
824
+ "cell_type": "markdown",
825
+ "id": "d952eb5d",
826
+ "metadata": {},
827
+ "source": [
828
+ "## Make the pipeline"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "id": "f96a244e",
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": [
838
+ "c = make_pipeline(Tfidf_vect, logisticred_model)\n",
839
+ "ls_X_test= list(Test_X)\n",
840
+ "class_names = {0: 'patro', 1:'non-patro'}\n",
841
+ "LIME_explainer = LimeTextExplainer(class_names=class_names)\n"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "code",
846
+ "execution_count": null,
847
+ "id": "c0a727a1",
848
+ "metadata": {},
849
+ "outputs": [],
850
+ "source": [
851
+ "idx = 15\n",
852
+ "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)"
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": null,
858
+ "id": "b1755fc8",
859
+ "metadata": {},
860
+ "outputs": [],
861
+ "source": [
862
+ "print('Document id: %d' % idx)\n",
863
+ "print('Text: ', ls_X_test[idx])\n",
864
+ "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n",
865
+ "print('True class: %s' % class_names.get(list(Test_Y)[idx]))"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "code",
870
+ "execution_count": null,
871
+ "id": "78b0d22e",
872
+ "metadata": {},
873
+ "outputs": [],
874
+ "source": [
875
+ "print(\"1 = non-Patro class, 0 = Patro class\")\n",
876
+ "# show the explainability results with highlighted text\n",
877
+ "LIME_exp.show_in_notebook(text=True)"
878
+ ]
879
+ },
880
+ {
881
+ "cell_type": "code",
882
+ "execution_count": null,
883
+ "id": "e3e16b80",
884
+ "metadata": {},
885
+ "outputs": [],
886
+ "source": [
887
+ "idx = 45\n",
888
+ "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)\n",
889
+ "print('Document id: %d' % idx)\n",
890
+ "print('Text: ', ls_X_test[idx])\n",
891
+ "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n",
892
+ "print('True class: %s' % class_names.get(list(Test_Y)[idx]))"
893
+ ]
894
+ },
895
+ {
896
+ "cell_type": "code",
897
+ "execution_count": null,
898
+ "id": "bd8e838a",
899
+ "metadata": {},
900
+ "outputs": [],
901
+ "source": [
902
+ "print(\"1 = non-Patro class, 0 = Patro class\")\n",
903
+ "# show the explainability results with highlighted text\n",
904
+ "LIME_exp.show_in_notebook(text=True)"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "markdown",
909
+ "id": "f8f07e74",
910
+ "metadata": {},
911
+ "source": [
912
+ "# Topic Modeling"
913
+ ]
914
+ },
915
+ {
916
+ "cell_type": "code",
917
+ "execution_count": null,
918
+ "id": "2825b328",
919
+ "metadata": {},
920
+ "outputs": [],
921
+ "source": [
922
+ "import pandas as pd\n",
923
+ "import numpy as np \n",
924
+ "import re\n",
925
+ "from wordcloud import WordCloud\n",
926
+ "import gensim\n",
927
+ "from gensim.utils import simple_preprocess\n",
928
+ "from nltk.corpus import stopwords\n",
929
+ "import gensim.corpora as corpora\n",
930
+ "from pprint import pprint\n",
931
+ "import pyLDAvis.gensim_models\n",
932
+ "import pickle\n",
933
+ "import pyLDAvis"
934
+ ]
935
+ },
936
+ {
937
+ "cell_type": "code",
938
+ "execution_count": null,
939
+ "id": "71ab6908",
940
+ "metadata": {},
941
+ "outputs": [],
942
+ "source": [
943
+ "df = pd.read_csv(\"dontpatronizeme.csv\", names = ['Message','label'])"
944
+ ]
945
+ },
946
+ {
947
+ "cell_type": "code",
948
+ "execution_count": null,
949
+ "id": "0c4a0602",
950
+ "metadata": {},
951
+ "outputs": [],
952
+ "source": [
953
+ "df[\"Message_processed\"] = df[\"Message\"].map(lambda x: re.sub('[,\\.!?]', '', str(x)))\n",
954
+ "df['Message_processed'] = df['Message_processed'].map(lambda x: x.lower())\n",
955
+ "df['Message_processed'].head()"
956
+ ]
957
+ },
958
+ {
959
+ "cell_type": "code",
960
+ "execution_count": null,
961
+ "id": "0e507f49",
962
+ "metadata": {},
963
+ "outputs": [],
964
+ "source": [
965
+ "long_string = ','.join(list(df['Message_processed'].values))# Create a WordCloud object\n",
966
+ "wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=3, contour_color='steelblue')# Generate a word cloud\n",
967
+ "wordcloud.generate(long_string)# Visualize the word cloud\n",
968
+ "wordcloud.to_image()"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "code",
973
+ "execution_count": null,
974
+ "id": "76a3f280",
975
+ "metadata": {},
976
+ "outputs": [],
977
+ "source": [
978
+ "stop_words = stopwords.words('english')\n",
979
+ "stop_words.extend(['from', 'subject', 're', 'edu', 'use'])\n",
980
+ "def sent_to_words(sentences):\n",
981
+ " for sentence in sentences:\n",
982
+ " # deacc=True removes punctuations\n",
983
+ " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
984
+ " \n",
985
+ "def remove_stopwords(texts):\n",
986
+ " return [[word for word in simple_preprocess(str(doc)) \n",
987
+ " if word not in stop_words] for doc in texts]\n",
988
+ "data = df.Message_processed.values.tolist()\n",
989
+ "data_words = list(sent_to_words(data))# remove stop words\n",
990
+ "data_words = remove_stopwords(data_words)"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "id": "1e257cc3",
997
+ "metadata": {},
998
+ "outputs": [],
999
+ "source": [
1000
+ "print(data_words[:1][0][:30])"
1001
+ ]
1002
+ },
1003
+ {
1004
+ "cell_type": "code",
1005
+ "execution_count": null,
1006
+ "id": "98c5203f",
1007
+ "metadata": {},
1008
+ "outputs": [],
1009
+ "source": [
1010
+ "id2word = corpora.Dictionary(data_words)\n",
1011
+ "texts = data_words# Term Document Frequency\n",
1012
+ "corpus = [id2word.doc2bow(text) for text in texts]# View\n",
1013
+ "print(corpus[:1][0][:30])"
1014
+ ]
1015
+ },
1016
+ {
1017
+ "cell_type": "code",
1018
+ "execution_count": null,
1019
+ "id": "b4a35025",
1020
+ "metadata": {},
1021
+ "outputs": [],
1022
+ "source": [
1023
+ "num_topics = 10# Build LDA model\n",
1024
+ "lda_model = gensim.models.LdaMulticore(corpus=corpus,\n",
1025
+ " id2word=id2word,\n",
1026
+ " num_topics=num_topics)\n",
1027
+ "# Print the Keyword in the 10 topics\n",
1028
+ "pprint(lda_model.print_topics())\n",
1029
+ "doc_lda = lda_model[corpus]"
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": null,
1035
+ "id": "00346a62",
1036
+ "metadata": {},
1037
+ "outputs": [],
1038
+ "source": [
1039
+ "pyLDAvis.enable_notebook()\n"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "code",
1044
+ "execution_count": null,
1045
+ "id": "f6f7889b",
1046
+ "metadata": {},
1047
+ "outputs": [],
1048
+ "source": [
1049
+ "vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds=\"mmds\", R=30)\n",
1050
+ "vis\n"
1051
+ ]
1052
+ },
1053
+ {
1054
+ "cell_type": "code",
1055
+ "execution_count": null,
1056
+ "id": "e4b7ca16",
1057
+ "metadata": {},
1058
+ "outputs": [],
1059
+ "source": []
1060
+ },
1061
+ {
1062
+ "cell_type": "code",
1063
+ "execution_count": null,
1064
+ "id": "1b214796",
1065
+ "metadata": {},
1066
+ "outputs": [],
1067
+ "source": []
1068
+ },
1069
+ {
1070
+ "cell_type": "code",
1071
+ "execution_count": null,
1072
+ "id": "e7f8e54c",
1073
+ "metadata": {},
1074
+ "outputs": [],
1075
+ "source": []
1076
+ },
1077
+ {
1078
+ "cell_type": "code",
1079
+ "execution_count": null,
1080
+ "id": "021b015f",
1081
+ "metadata": {},
1082
+ "outputs": [],
1083
+ "source": []
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": null,
1088
+ "id": "ab1a9490",
1089
+ "metadata": {},
1090
+ "outputs": [],
1091
+ "source": []
1092
+ },
1093
+ {
1094
+ "cell_type": "code",
1095
+ "execution_count": null,
1096
+ "id": "0da95a15",
1097
+ "metadata": {},
1098
+ "outputs": [],
1099
+ "source": []
1100
+ },
1101
+ {
1102
+ "cell_type": "code",
1103
+ "execution_count": null,
1104
+ "id": "22c069c0",
1105
+ "metadata": {},
1106
+ "outputs": [],
1107
+ "source": []
1108
+ },
1109
+ {
1110
+ "cell_type": "code",
1111
+ "execution_count": null,
1112
+ "id": "c02c30f3",
1113
+ "metadata": {},
1114
+ "outputs": [],
1115
+ "source": []
1116
+ },
1117
+ {
1118
+ "cell_type": "code",
1119
+ "execution_count": null,
1120
+ "id": "9cdde3ad",
1121
+ "metadata": {},
1122
+ "outputs": [],
1123
+ "source": []
1124
+ },
1125
+ {
1126
+ "cell_type": "code",
1127
+ "execution_count": null,
1128
+ "id": "717270ef",
1129
+ "metadata": {},
1130
+ "outputs": [],
1131
+ "source": []
1132
+ },
1133
+ {
1134
+ "cell_type": "code",
1135
+ "execution_count": null,
1136
+ "id": "25a8f105",
1137
+ "metadata": {},
1138
+ "outputs": [],
1139
+ "source": []
1140
+ }
1141
+ ],
1142
+ "metadata": {
1143
+ "kernelspec": {
1144
+ "display_name": "Python 3 (ipykernel)",
1145
+ "language": "python",
1146
+ "name": "python3"
1147
+ },
1148
+ "language_info": {
1149
+ "codemirror_mode": {
1150
+ "name": "ipython",
1151
+ "version": 3
1152
+ },
1153
+ "file_extension": ".py",
1154
+ "mimetype": "text/x-python",
1155
+ "name": "python",
1156
+ "nbconvert_exporter": "python",
1157
+ "pygments_lexer": "ipython3",
1158
+ "version": "3.9.7"
1159
+ }
1160
+ },
1161
+ "nbformat": 4,
1162
+ "nbformat_minor": 5
1163
+ }