achyut committed on
Commit
0103f05
1 Parent(s): 1ae9e8b

Upload NotebookPCL.ipynb

Files changed (1)
  1. NotebookPCL.ipynb +1163 -0
NotebookPCL.ipynb ADDED
@@ -0,0 +1,1163 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "060994f2",
6
+ "metadata": {},
7
+ "source": [
8
+ "# importing the necessary libraries"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "033ebd27",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# imports - native Python\n",
19
+ "import collections\n",
20
+ "import csv\n",
21
+ "import os\n",
22
+ "import re\n",
23
+ "# imports - 3rd party\n",
24
+ "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
25
+ "# installs from 🤗\n",
26
+ "! pip install transformers\n",
27
+ "! pip install datasets\n",
28
+ "from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
29
+ "from datasets import Dataset, DatasetDict"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "0214c70f",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import torch\n",
40
+ "torch.cuda.empty_cache()"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "id": "13732b06",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Loading the data"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "e5a782b3",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "# Using csv instead of pandas for sanity and to do filtering while loading\n",
59
+ "\n",
60
+ "# make parallel lists of texts and labels\n",
61
+ "# texts: strings containing messages\n",
62
+ "dataset_dict = {'text':[], 'label':[]}\n",
63
+ "for f in os.listdir():\n",
64
+ " # use all .tsv files that have been loaded\n",
65
+ " if f.endswith('dontpatronizeme.tsv'):\n",
66
+ " with open(f) as tsv_file:\n",
67
+ " reader = csv.DictReader(tsv_file, dialect='excel-tab')\n",
68
+ " for line in reader:\n",
69
+ " text = line['text']\n",
70
+ " # a few of the Message fields are empty, so we should skip those ones\n",
71
+ " if text!=None and text.strip()!=\"\":\n",
72
+ " dataset_dict['text'].append(text)\n",
73
+ " dataset_dict['label'].append(int(line['label']))\n",
74
+ "# huggingface function to convert from dict to their Dataset object\n",
75
+ "# which will work nicely with their model trainer\n",
76
+ "ds = Dataset.from_dict(dataset_dict)"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "52379811",
82
+ "metadata": {},
83
+ "source": [
84
+ "# Creating train, valid, test splits"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "id": "a6f69bc1",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "# no function to split into train/validation/test so we do 2 separate splits\n",
95
+ "# first split 80-20 into train and test+validation\n",
96
+ "train_testvalid = ds.train_test_split(test_size=0.2)\n",
97
+ "# then split the 20 into 10-10 validation and test\n",
98
+ "test_valid = train_testvalid['test'].train_test_split(test_size=0.5)\n",
99
+ "# finally, make the full dataset the 80-10-10 split as a DatasetDict object\n",
100
+ "train_test_valid_dataset = DatasetDict({\n",
101
+ " 'train': train_testvalid['train'],\n",
102
+ " 'test': test_valid['test'],\n",
103
+ " 'valid': test_valid['train']})\n",
104
+ "# quick check (if this doesn't pass, will get an error in the tokenization)\n",
105
+ "# makes sure we filtered the data correcly at the beginning and removed None\n",
106
+ "for split in train_test_valid_dataset.keys():\n",
107
+ " assert not any([x==None for x in train_test_valid_dataset[split]['text']])"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "id": "0dfcc029",
113
+ "metadata": {},
114
+ "source": [
115
+ "# Tokenizer"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "markdown",
120
+ "id": "b2cb0082",
121
+ "metadata": {},
122
+ "source": [
123
+ "This is the tokenizer for the distilbert model"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "65a26dc2",
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "# just use the default tokenizer for the model\n",
134
+ "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n",
135
+ "\n",
136
+ "# simple wrapper\n",
137
+ "def tokenize(examples, textfield=\"text\"):\n",
138
+ " return tokenizer(examples[textfield], padding=\"max_length\", truncation=True)\n",
139
+ "\n",
140
+ "# batch tokenization\n",
141
+ "tokenized_datasets = train_test_valid_dataset.map(tokenize, batched=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "id": "38a15ebb",
147
+ "metadata": {},
148
+ "source": [
149
+ "Below are the examples for also the RoBERTa model and the BERT model"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "id": "8f45cf1d",
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
160
+ "\n",
161
+ "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
162
+ "\n",
163
+ "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\")"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "id": "79d33a06",
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
174
+ "\n",
175
+ "tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n",
176
+ "\n",
177
+ "model = AutoModelForMaskedLM.from_pretrained(\"roberta-base\")"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "markdown",
182
+ "id": "9b550e83",
183
+ "metadata": {},
184
+ "source": [
185
+ "# Model "
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": null,
191
+ "id": "12c960c0",
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "# Setup collation\n",
196
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
197
+ "\n",
198
+ "# Load model\n",
199
+ "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "markdown",
204
+ "id": "d4342956",
205
+ "metadata": {},
206
+ "source": [
207
+ "# Computing the metrics and training args"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "id": "4c974458",
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
+ "# using sklearn to compute precision, recall, f1, and accuracy\n",
218
+ "def compute_metrics(pred):\n",
219
+ " labels = pred.label_ids\n",
220
+ " preds = pred.predictions.argmax(-1)\n",
221
+ " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')\n",
222
+ " acc = accuracy_score(labels, preds)\n",
223
+ " return {\n",
224
+ " 'accuracy': acc,\n",
225
+ " 'f1': f1,\n",
226
+ " 'precision': precision,\n",
227
+ " 'recall': recall\n",
228
+ " }"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "id": "8c4fb414",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "# Set training args (just using defaults from the following tutorial for now:\n",
239
+ "# https://huggingface.co/docs/transformers/training )\n",
240
+ "training_args = TrainingArguments(\n",
241
+ " output_dir=\"./results\",\n",
242
+ " learning_rate=2e-5,\n",
243
+ " per_device_train_batch_size=16,\n",
244
+ " per_device_eval_batch_size=16,\n",
245
+ " num_train_epochs=5,\n",
246
+ " weight_decay=0.01,\n",
247
+ ")\n",
248
+ "\n",
249
+ "# setup the trainer\n",
250
+ "trainer = Trainer(\n",
251
+ " model=model,\n",
252
+ " args=training_args,\n",
253
+ " train_dataset=tokenized_datasets[\"train\"],\n",
254
+ " eval_dataset=tokenized_datasets[\"valid\"],\n",
255
+ " tokenizer=tokenizer,\n",
256
+ " data_collator=data_collator,\n",
257
+ " compute_metrics=compute_metrics,\n",
258
+ ")"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "markdown",
263
+ "id": "cb346507",
264
+ "metadata": {},
265
+ "source": [
266
+ "# Train model and Evaluate"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "id": "de170b1e",
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "# train the model\n",
277
+ "trainer.train()"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "id": "48adbaed",
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "# evaluate on the test set\n",
288
+ "# should only do for _best_ model of each type \n",
289
+ "# after selecting hyperparameters that work best on validation set\n",
290
+ "trainer.evaluate(tokenized_datasets[\"test\"])"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "id": "c3dea644",
297
+ "metadata": {},
298
+ "outputs": [],
299
+ "source": [
300
+ "##!pip install huggingface_hub\n",
301
+ "#!sudo apt-get install fit-lfs\n",
302
+ "#!huggingface-cli login\n",
303
+ "#!git clone https://huggingface.co/achyut/patronizing_detection\n",
304
+ "#cd /content/patronizing_detection"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "markdown",
309
+ "id": "539c8683",
310
+ "metadata": {},
311
+ "source": [
312
+ "# LIME for Deep Learning Models"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": null,
318
+ "id": "9f7c2cab",
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "# LIME importing all the necessary libraries\n",
323
+ "import numpy as np\n",
324
+ "import lime\n",
325
+ "import torch\n",
326
+ "import torch.nn.functional as F\n",
327
+ "from lime.lime_text import LimeTextExplainer\n",
328
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": null,
334
+ "id": "d53f4b7d",
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "# Set the class names\n",
339
+ "class_names = ['non-patronizing','patronizing']"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "markdown",
344
+ "id": "2d91f290",
345
+ "metadata": {},
346
+ "source": [
347
+ "For LIME and other interpretable AI models, we Have to use the tokenizer and the model of the fine-tuned pretrained model. Not the Huggingface un fine tuned model. That is because we want to use the model with the trained weights, tokens and vocab"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "e2381d7b",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "tokenizer = AutoTokenizer.from_pretrained(\"achyut/patronizing_detection\")\n",
358
+ "\n",
359
+ "model = AutoModelForSequenceClassification.from_pretrained(\"achyut/patronizing_detection\")"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": null,
365
+ "id": "318859d6",
366
+ "metadata": {},
367
+ "outputs": [],
368
+ "source": [
369
+ "model.cuda()"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": null,
375
+ "id": "99a7e69f",
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "!pip install more_itertools\n"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "markdown",
384
+ "id": "c810588c",
385
+ "metadata": {},
386
+ "source": [
387
+ "# The function that calculates the logits for each sequence. "
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "c3db6441",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "import more_itertools\n",
398
+ "def predictor4(texts, batch_size=64):\n",
399
+ " probas = []\n",
400
+ " for chunk in more_itertools.chunked(texts, batch_size):\n",
401
+ " tokenized = tokenizer(chunk, return_tensors=\"pt\", padding=True)\n",
402
+ " outputs = model(tokenized['input_ids'].to('cuda'), tokenized['attention_mask'].to('cuda'))\n",
403
+ " probas.append(F.softmax(outputs.logits).cpu().detach().numpy())\n",
404
+ " return np.vstack(probas)"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "id": "1074572d",
411
+ "metadata": {},
412
+ "outputs": [],
413
+ "source": [
414
+ "predictor4([\"I have two dogs\",\"The keep barking\"])"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": null,
420
+ "id": "661d8281",
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": [
424
+ "explainer = LimeTextExplainer(class_names=class_names)"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "id": "abb9b201",
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "str_to_predict = ds[6]['text']\n",
435
+ "exp = explainer.explain_instance(str_to_predict, predictor4, num_features= 25, num_samples = 2000)\n",
436
+ "exp.show_in_notebook(text=str_to_predict)"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": null,
442
+ "id": "1885619b",
443
+ "metadata": {},
444
+ "outputs": [],
445
+ "source": [
446
+ "exp.as_list()"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "id": "5f004287",
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": []
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "id": "42dfbb84",
460
+ "metadata": {},
461
+ "source": [
462
+ "# classical Machine Learning"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": null,
468
+ "id": "94835013",
469
+ "metadata": {},
470
+ "outputs": [],
471
+ "source": [
472
+ "import collections\n",
473
+ "import csv\n",
474
+ "import os\n",
475
+ "import re\n",
476
+ "import pandas as pd\n",
477
+ "import numpy as np\n",
478
+ "from nltk.tokenize import word_tokenize\n",
479
+ "from sklearn.preprocessing import LabelEncoder\n",
480
+ "from collections import defaultdict\n",
481
+ "from nltk.corpus import wordnet as wn\n",
482
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
483
+ "from sklearn import model_selection, naive_bayes, svm\n",
484
+ "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n",
485
+ "from nltk import pos_tag\n",
486
+ "from nltk.corpus import stopwords\n",
487
+ "from nltk.stem import WordNetLemmatizer"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": null,
493
+ "id": "8605ed57",
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": [
497
+ "# We can use a seed if we want reproducibility\n",
498
+ "#np.random.seed(500)"
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "execution_count": null,
504
+ "id": "5475808d",
505
+ "metadata": {},
506
+ "outputs": [],
507
+ "source": [
508
+ "import nltk\n",
509
+ "nltk.download('wordnet')"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "id": "c3745eee",
516
+ "metadata": {},
517
+ "outputs": [],
518
+ "source": [
519
+ "import nltk\n",
520
+ "nltk.download('averaged_perceptron_tagger')"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "code",
525
+ "execution_count": null,
526
+ "id": "180f42bf",
527
+ "metadata": {},
528
+ "outputs": [],
529
+ "source": [
530
+ "Corpus = pd.read_csv(\"patro_downsampled.csv\", names = ['text','label'])\n",
531
+ "# change it to str, lower case and drop the na values\n",
532
+ "Corpus.text = Corpus.text.astype(str)\n",
533
+ "Corpus['text'] = Corpus['text'].str.lower()\n",
534
+ "Corpus = Corpus.dropna()\n",
535
+ "Corpus.head()"
536
+ ]
537
+ },
538
+ {
539
+ "cell_type": "code",
540
+ "execution_count": null,
541
+ "id": "5f9d00c8",
542
+ "metadata": {},
543
+ "outputs": [],
544
+ "source": [
545
+ "Corpus.info()"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": null,
551
+ "id": "659d463e",
552
+ "metadata": {},
553
+ "outputs": [],
554
+ "source": [
555
+ "#tokenizing our para text column here\n",
556
+ "Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)\n",
557
+ "\n",
558
+ "# Tagging to understand if the word is a noun, verb, adverb etc\n",
559
+ "\n",
560
+ "tag_map = defaultdict(lambda : wn.NOUN)\n",
561
+ "tag_map['J'] = wn.ADJ\n",
562
+ "tag_map['V'] = wn.VERB\n",
563
+ "tag_map['R'] = wn.ADV"
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "code",
568
+ "execution_count": null,
569
+ "id": "5af9ea94",
570
+ "metadata": {},
571
+ "outputs": [],
572
+ "source": [
573
+ "for index,entry in enumerate(Corpus['text']):\n",
574
+ " # empty list which I will append to the df in the end.\n",
575
+ " Final_words = []\n",
576
+ " \n",
577
+ " word_Lemmatized = WordNetLemmatizer()\n",
578
+ " for word, tag in pos_tag(entry):\n",
579
+ " # check for Stop words and consider only alphabets\n",
580
+ " if word not in stopwords.words('english') and word.isalpha():\n",
581
+ " word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n",
582
+ " Final_words.append(word_Final)\n",
583
+ " # The final processed set of words for each iteration will be stored in 'text_final'\n",
584
+ " Corpus.loc[index,'text_final'] = str(Final_words)"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": null,
590
+ "id": "8c6d9bc6",
591
+ "metadata": {},
592
+ "outputs": [],
593
+ "source": [
594
+ "Corpus.head()"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": null,
600
+ "id": "f654c4ab",
601
+ "metadata": {},
602
+ "outputs": [],
603
+ "source": [
604
+ "#Train, test split\n",
605
+ "Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],\n",
606
+ " Corpus['label'],\n",
607
+ " test_size=0.2)"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": null,
613
+ "id": "00747dbd",
614
+ "metadata": {},
615
+ "outputs": [],
616
+ "source": [
617
+ "#Encoding our labels\n",
618
+ "Encoder = LabelEncoder()\n",
619
+ "Train_Y = Encoder.fit_transform(Train_Y)\n",
620
+ "Test_Y = Encoder.fit_transform(Test_Y)\n",
621
+ "\n",
622
+ "# Vectorizer\n",
623
+ "Tfidf_vect = TfidfVectorizer()\n",
624
+ "\n",
625
+ "Tfidf_vect.fit(Corpus['text_final'])"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": null,
631
+ "id": "95b89126",
632
+ "metadata": {},
633
+ "outputs": [],
634
+ "source": [
635
+ "# Transforming the train and test inputs into vectors\n",
636
+ "Train_X_Tfidf = Tfidf_vect.transform(Train_X)\n",
637
+ "Test_X_Tfidf = Tfidf_vect.transform(Test_X)\n",
638
+ "print(len(Tfidf_vect.vocabulary_))"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "markdown",
643
+ "id": "1da1f215",
644
+ "metadata": {},
645
+ "source": [
646
+ "# Fitting Models"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "markdown",
651
+ "id": "b8d618cd",
652
+ "metadata": {},
653
+ "source": [
654
+ "## NaiveBayes"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": null,
660
+ "id": "7613821b",
661
+ "metadata": {},
662
+ "outputs": [],
663
+ "source": [
664
+ "# fit the NB classifier\n",
665
+ "Naive = naive_bayes.MultinomialNB()\n",
666
+ "naive_model = Naive.fit(Train_X_Tfidf,Train_Y)\n",
667
+ "predictions_NB = Naive.predict(Test_X_Tfidf)\n",
668
+ "print(\"Naive Bayes Accuracy Score -> \",accuracy_score(predictions_NB, Test_Y)*100)"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": null,
674
+ "id": "d04b0813",
675
+ "metadata": {},
676
+ "outputs": [],
677
+ "source": [
678
+ "print(f1_score(predictions_NB, Test_Y),precision_score(predictions_NB, Test_Y),recall_score(predictions_NB, Test_Y))"
679
+ ]
680
+ },
681
+ {
682
+ "cell_type": "markdown",
683
+ "id": "539cb258",
684
+ "metadata": {},
685
+ "source": [
686
+ "## SVM"
687
+ ]
688
+ },
689
+ {
690
+ "cell_type": "code",
691
+ "execution_count": null,
692
+ "id": "cf9ebed3",
693
+ "metadata": {},
694
+ "outputs": [],
695
+ "source": [
696
+ "#SVM classifier\n",
697
+ "SVM = svm.SVC(C=2.0, kernel='poly',degree=2, gamma='scale')\n",
698
+ "svm_model = SVM.fit(Train_X_Tfidf,Train_Y)\n",
699
+ "predictions_SVM = SVM.predict(Test_X_Tfidf)\n",
700
+ "print(\"SVM Accuracy Score -> \",accuracy_score(predictions_SVM, Test_Y)*100)"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": null,
706
+ "id": "1fbf3e41",
707
+ "metadata": {},
708
+ "outputs": [],
709
+ "source": [
710
+ "print(f1_score(predictions_SVM, Test_Y),precision_score(predictions_SVM, Test_Y),recall_score(predictions_SVM, Test_Y))"
711
+ ]
712
+ },
713
+ {
714
+ "cell_type": "code",
715
+ "execution_count": null,
716
+ "id": "81cf1425",
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": [
720
+ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 5 , scoring = 'f1_macro')\n",
721
+ "scores"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": null,
727
+ "id": "c5a07117",
728
+ "metadata": {},
729
+ "outputs": [],
730
+ "source": [
731
+ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 10 , scoring = 'f1_macro')\n",
732
+ "scores"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "markdown",
737
+ "id": "a4dea60f",
738
+ "metadata": {},
739
+ "source": [
740
+ "## Logistic Regression"
741
+ ]
742
+ },
743
+ {
744
+ "cell_type": "code",
745
+ "execution_count": null,
746
+ "id": "7c96b88d",
747
+ "metadata": {},
748
+ "outputs": [],
749
+ "source": [
750
+ "from sklearn.linear_model import LogisticRegression\n",
751
+ "logisticReg = LogisticRegression()\n",
752
+ "logisticReg.fit(Train_X_Tfidf,Train_Y)\n",
753
+ "predictions_LR = logisticReg.predict(Test_X_Tfidf)\n",
754
+ "print(\"LR Accuracy Score -> \",accuracy_score(predictions_LR, Test_Y)*100)"
755
+ ]
756
+ },
757
+ {
758
+ "cell_type": "code",
759
+ "execution_count": null,
760
+ "id": "47750ca0",
761
+ "metadata": {},
762
+ "outputs": [],
763
+ "source": [
764
+ "print(f1_score(predictions_LR, Test_Y), precision_score(predictions_LR, Test_Y),recall_score(predictions_LR, Test_Y))"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "markdown",
769
+ "id": "75efc6b3",
770
+ "metadata": {},
771
+ "source": [
772
+ "## RandomForest"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "code",
777
+ "execution_count": null,
778
+ "id": "144104e6",
779
+ "metadata": {},
780
+ "outputs": [],
781
+ "source": [
782
+ "# Apply random forest on the data\n",
783
+ "from sklearn.ensemble import RandomForestClassifier\n",
784
+ "randomForest = RandomForestClassifier(n_estimators = 50) \n",
785
+ "randomForest.fit(Train_X_Tfidf,Train_Y)\n",
786
+ "predictions_RF = logisticReg.predict(Test_X_Tfidf)\n",
787
+ "print(\"LR Accuracy Score -> \",accuracy_score(predictions_RF, Test_Y)*100)"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": null,
793
+ "id": "1f083f5e",
794
+ "metadata": {},
795
+ "outputs": [],
796
+ "source": [
797
+ "print(f1_score(predictions_RF, Test_Y),precision_score(predictions_RF, Test_Y),recall_score(predictions_RF, Test_Y))"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "markdown",
802
+ "id": "03fb7cc8",
803
+ "metadata": {},
804
+ "source": [
805
+ "# LIME for classical ML"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": null,
811
+ "id": "41fa18be",
812
+ "metadata": {},
813
+ "outputs": [],
814
+ "source": [
815
+ "import lime\n",
816
+ "import sklearn.ensemble\n",
817
+ "from __future__ import print_function\n",
818
+ "from lime import lime_text\n",
819
+ "from sklearn.pipeline import make_pipeline\n",
820
+ "from lime.lime_text import LimeTextExplainer"
821
+ ]
822
+ },
823
+ {
824
+ "cell_type": "markdown",
825
+ "id": "d952eb5d",
826
+ "metadata": {},
827
+ "source": [
828
+ "## Make the pipeline"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "id": "f96a244e",
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": [
838
+ "c = make_pipeline(Tfidf_vect, logisticred_model)\n",
839
+ "ls_X_test= list(Test_X)\n",
840
+ "class_names = {0: 'patro', 1:'non-patro'}\n",
841
+ "LIME_explainer = LimeTextExplainer(class_names=class_names)\n"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "code",
846
+ "execution_count": null,
847
+ "id": "c0a727a1",
848
+ "metadata": {},
849
+ "outputs": [],
850
+ "source": [
851
+ "idx = 15\n",
852
+ "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)"
853
+ ]
854
+ },
855
+ {
856
+ "cell_type": "code",
857
+ "execution_count": null,
858
+ "id": "b1755fc8",
859
+ "metadata": {},
860
+ "outputs": [],
861
+ "source": [
862
+ "print('Document id: %d' % idx)\n",
863
+ "print('Text: ', ls_X_test[idx])\n",
864
+ "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n",
865
+ "print('True class: %s' % class_names.get(list(Test_Y)[idx]))"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "code",
870
+ "execution_count": null,
871
+ "id": "78b0d22e",
872
+ "metadata": {},
873
+ "outputs": [],
874
+ "source": [
875
+ "print(\"1 = non-Patro class, 0 = Patro class\")\n",
876
+ "# show the explainability results with highlighted text\n",
877
+ "LIME_exp.show_in_notebook(text=True)"
878
+ ]
879
+ },
880
+ {
881
+ "cell_type": "code",
882
+ "execution_count": null,
883
+ "id": "e3e16b80",
884
+ "metadata": {},
885
+ "outputs": [],
886
+ "source": [
887
+ "idx = 45\n",
888
+ "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)\n",
889
+ "print('Document id: %d' % idx)\n",
890
+ "print('Text: ', ls_X_test[idx])\n",
891
+ "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n",
892
+ "print('True class: %s' % class_names.get(list(Test_Y)[idx]))"
893
+ ]
894
+ },
895
+ {
896
+ "cell_type": "code",
897
+ "execution_count": null,
898
+ "id": "bd8e838a",
899
+ "metadata": {},
900
+ "outputs": [],
901
+ "source": [
902
+ "print(\"1 = non-Patro class, 0 = Patro class\")\n",
903
+ "# show the explainability results with highlighted text\n",
904
+ "LIME_exp.show_in_notebook(text=True)"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "markdown",
909
+ "id": "f8f07e74",
910
+ "metadata": {},
911
+ "source": [
912
+ "# Topic Modeling"
913
+ ]
914
+ },
915
+ {
916
+ "cell_type": "code",
917
+ "execution_count": null,
918
+ "id": "2825b328",
919
+ "metadata": {},
920
+ "outputs": [],
921
+ "source": [
922
+ "import pandas as pd\n",
923
+ "import numpy as np \n",
924
+ "import re\n",
925
+ "from wordcloud import WordCloud\n",
926
+ "import gensim\n",
927
+ "from gensim.utils import simple_preprocess\n",
928
+ "from nltk.corpus import stopwords\n",
929
+ "import gensim.corpora as corpora\n",
930
+ "from pprint import pprint\n",
931
+ "import pyLDAvis.gensim_models\n",
932
+ "import pickle\n",
933
+ "import pyLDAvis"
934
+ ]
935
+ },
936
+ {
937
+ "cell_type": "code",
938
+ "execution_count": null,
939
+ "id": "71ab6908",
940
+ "metadata": {},
941
+ "outputs": [],
942
+ "source": [
943
+ "df = pd.read_csv(\"dontpatronizeme.csv\", names = ['Message','label'])"
944
+ ]
945
+ },
946
+ {
947
+ "cell_type": "code",
948
+ "execution_count": null,
949
+ "id": "0c4a0602",
950
+ "metadata": {},
951
+ "outputs": [],
952
+ "source": [
953
+ "df[\"Message_processed\"] = df[\"Message\"].map(lambda x: re.sub('[,\\.!?]', '', str(x)))\n",
954
+ "df['Message_processed'] = df['Message_processed'].map(lambda x: x.lower())\n",
955
+ "df['Message_processed'].head()"
956
+ ]
957
+ },
958
+ {
959
+ "cell_type": "code",
960
+ "execution_count": null,
961
+ "id": "0e507f49",
962
+ "metadata": {},
963
+ "outputs": [],
964
+ "source": [
965
+ "long_string = ','.join(list(df['Message_processed'].values))# Create a WordCloud object\n",
966
+ "wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=3, contour_color='steelblue')# Generate a word cloud\n",
967
+ "wordcloud.generate(long_string)# Visualize the word cloud\n",
968
+ "wordcloud.to_image()"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "code",
973
+ "execution_count": null,
974
+ "id": "76a3f280",
975
+ "metadata": {},
976
+ "outputs": [],
977
+ "source": [
978
+ "stop_words = stopwords.words('english')\n",
979
+ "stop_words.extend(['from', 'subject', 're', 'edu', 'use'])\n",
980
+ "def sent_to_words(sentences):\n",
981
+ " for sentence in sentences:\n",
982
+ " # deacc=True removes punctuations\n",
983
+ " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
984
+ " \n",
985
+ "def remove_stopwords(texts):\n",
986
+ " return [[word for word in simple_preprocess(str(doc)) \n",
987
+ " if word not in stop_words] for doc in texts]\n",
988
+ "data = df.Message_processed.values.tolist()\n",
989
+ "data_words = list(sent_to_words(data))# remove stop words\n",
990
+ "data_words = remove_stopwords(data_words)"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "id": "1e257cc3",
997
+ "metadata": {},
998
+ "outputs": [],
999
+ "source": [
1000
+ "print(data_words[:1][0][:30])"
1001
+ ]
1002
+ },
1003
+ {
1004
+ "cell_type": "code",
1005
+ "execution_count": null,
1006
+ "id": "98c5203f",
1007
+ "metadata": {},
1008
+ "outputs": [],
1009
+ "source": [
1010
+ "id2word = corpora.Dictionary(data_words)\n",
1011
+ "texts = data_words# Term Document Frequency\n",
1012
+ "corpus = [id2word.doc2bow(text) for text in texts]# View\n",
1013
+ "print(corpus[:1][0][:30])"
1014
+ ]
1015
+ },
1016
+ {
1017
+ "cell_type": "code",
1018
+ "execution_count": null,
1019
+ "id": "b4a35025",
1020
+ "metadata": {},
1021
+ "outputs": [],
1022
+ "source": [
1023
+ "num_topics = 10# Build LDA model\n",
1024
+ "lda_model = gensim.models.LdaMulticore(corpus=corpus,\n",
1025
+ " id2word=id2word,\n",
1026
+ " num_topics=num_topics)\n",
1027
+ "# Print the Keyword in the 10 topics\n",
1028
+ "pprint(lda_model.print_topics())\n",
1029
+ "doc_lda = lda_model[corpus]"
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": null,
1035
+ "id": "00346a62",
1036
+ "metadata": {},
1037
+ "outputs": [],
1038
+ "source": [
1039
+ "pyLDAvis.enable_notebook()\n"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "code",
1044
+ "execution_count": null,
1045
+ "id": "f6f7889b",
1046
+ "metadata": {},
1047
+ "outputs": [],
1048
+ "source": [
1049
+ "vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds=\"mmds\", R=30)\n",
1050
+ "vis\n"
1051
+ ]
1052
+ },
1053
+ {
1054
+ "cell_type": "code",
1055
+ "execution_count": null,
1056
+ "id": "e4b7ca16",
1057
+ "metadata": {},
1058
+ "outputs": [],
1059
+ "source": []
1060
+ },
1061
+ {
1062
+ "cell_type": "code",
1063
+ "execution_count": null,
1064
+ "id": "1b214796",
1065
+ "metadata": {},
1066
+ "outputs": [],
1067
+ "source": []
1068
+ },
1069
+ {
1070
+ "cell_type": "code",
1071
+ "execution_count": null,
1072
+ "id": "e7f8e54c",
1073
+ "metadata": {},
1074
+ "outputs": [],
1075
+ "source": []
1076
+ },
1077
+ {
1078
+ "cell_type": "code",
1079
+ "execution_count": null,
1080
+ "id": "021b015f",
1081
+ "metadata": {},
1082
+ "outputs": [],
1083
+ "source": []
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": null,
1088
+ "id": "ab1a9490",
1089
+ "metadata": {},
1090
+ "outputs": [],
1091
+ "source": []
1092
+ },
1093
+ {
1094
+ "cell_type": "code",
1095
+ "execution_count": null,
1096
+ "id": "0da95a15",
1097
+ "metadata": {},
1098
+ "outputs": [],
1099
+ "source": []
1100
+ },
1101
+ {
1102
+ "cell_type": "code",
1103
+ "execution_count": null,
1104
+ "id": "22c069c0",
1105
+ "metadata": {},
1106
+ "outputs": [],
1107
+ "source": []
1108
+ },
1109
+ {
1110
+ "cell_type": "code",
1111
+ "execution_count": null,
1112
+ "id": "c02c30f3",
1113
+ "metadata": {},
1114
+ "outputs": [],
1115
+ "source": []
1116
+ },
1117
+ {
1118
+ "cell_type": "code",
1119
+ "execution_count": null,
1120
+ "id": "9cdde3ad",
1121
+ "metadata": {},
1122
+ "outputs": [],
1123
+ "source": []
1124
+ },
1125
+ {
1126
+ "cell_type": "code",
1127
+ "execution_count": null,
1128
+ "id": "717270ef",
1129
+ "metadata": {},
1130
+ "outputs": [],
1131
+ "source": []
1132
+ },
1133
+ {
1134
+ "cell_type": "code",
1135
+ "execution_count": null,
1136
+ "id": "25a8f105",
1137
+ "metadata": {},
1138
+ "outputs": [],
1139
+ "source": []
1140
+ }
1141
+ ],
1142
+ "metadata": {
1143
+ "kernelspec": {
1144
+ "display_name": "Python 3 (ipykernel)",
1145
+ "language": "python",
1146
+ "name": "python3"
1147
+ },
1148
+ "language_info": {
1149
+ "codemirror_mode": {
1150
+ "name": "ipython",
1151
+ "version": 3
1152
+ },
1153
+ "file_extension": ".py",
1154
+ "mimetype": "text/x-python",
1155
+ "name": "python",
1156
+ "nbconvert_exporter": "python",
1157
+ "pygments_lexer": "ipython3",
1158
+ "version": "3.9.7"
1159
+ }
1160
+ },
1161
+ "nbformat": 4,
1162
+ "nbformat_minor": 5
1163
+ }