# importing the necessary libraries

In [None]:
# imports - native Python
import collections
import csv
import os
import re
# imports - 3rd party
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# installs from 🤗
! pip install transformers
! pip install datasets
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

In [None]:
import torch
torch.cuda.empty_cache()

# Loading the data

In [None]:
# Using csv instead of pandas for sanity and to do filtering while loading

# Build two parallel lists: 'text' holds the message strings and 'label' the
# integer label read from the same row.
dataset_dict = {'text': [], 'label': []}
for f in os.listdir():
    # only files in the working directory whose name ends with
    # 'dontpatronizeme.tsv' are loaded
    if f.endswith('dontpatronizeme.tsv'):
        # newline='' is what the csv module asks for, so that line breaks
        # embedded in quoted fields are handed to the reader untouched
        with open(f, newline='') as tsv_file:
            reader = csv.DictReader(tsv_file, dialect='excel-tab')
            for line in reader:
                text = line['text']
                # a few of the text fields are empty, so we skip those rows
                if text is not None and text.strip() != "":
                    dataset_dict['text'].append(text)
                    dataset_dict['label'].append(int(line['label']))
# huggingface function to convert from dict to their Dataset object
# which will work nicely with their model trainer
ds = Dataset.from_dict(dataset_dict)

# Creating train, valid, test splits

In [None]:
# no function to split into train/validation/test so we do 2 separate splits
# A fixed seed keeps the three splits identical from one run to the next, so
# validation/test numbers from different runs are comparable.
SPLIT_SEED = 42
# first split 80-20 into train and test+validation
train_testvalid = ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# then split the 20 into 10-10 validation and test
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# finally, make the full dataset the 80-10-10 split as a DatasetDict object
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
# quick check (if this doesn't pass, will get an error in the tokenization)
# makes sure we filtered the data correctly at the beginning and removed None
for split in train_test_valid_dataset.keys():
    assert all(x is not None for x in train_test_valid_dataset[split]['text'])

# Tokenizer

This is the tokenizer for the distilbert model

In [None]:
# default tokenizer published with the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(examples, textfield="text"):
    """Tokenize one batch of examples with the module-level `tokenizer`.

    Args:
        examples: mapping from which the entry named `textfield` is read.
        textfield: key of the entry holding the raw text (default "text").

    Returns:
        Whatever `tokenizer` returns when called with padding="max_length"
        and truncation=True.
    """
    batch_texts = examples[textfield]
    return tokenizer(batch_texts, padding="max_length", truncation=True)


# run the wrapper over every split, one batch at a time
tokenized_datasets = train_test_valid_dataset.map(tokenize, batched=True)

Below are examples of how to load the tokenizer and model for BERT and RoBERTa as well.

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Example only. The objects get their own names so that running this cell does
# not replace the DistilBERT `tokenizer` created earlier, which the collator
# and Trainer cells further down read.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

bert_mlm_model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Example only. The objects get their own names so that running this cell does
# not replace the DistilBERT `tokenizer` created earlier, which the collator
# and Trainer cells further down read.
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

roberta_mlm_model = AutoModelForMaskedLM.from_pretrained("roberta-base")

# Model 

In [None]:
# One checkpoint name for both the tokenizer and the classifier.
MODEL_CHECKPOINT = "distilbert-base-uncased"

# (Re)load the tokenizer here so that the collator and the Trainer are
# guaranteed to use the one that matches the classifier, whatever an
# intermediate cell may have bound to the name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Setup collation
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load model (two output labels)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

# Computing the metrics and training args

In [None]:
# precision, recall, f1 and accuracy, all computed with sklearn
def compute_metrics(pred):
    """Score one round of predictions.

    Args:
        pred: object exposing `label_ids` (reference labels) and
            `predictions` (scores; the argmax over the last axis is taken
            as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall';
        precision/recall/f1 use average='binary'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1]
    }

In [None]:
# Set training args (just using defaults from the following tutorial for now:
# https://huggingface.co/docs/transformers/training )
# Outputs go to ./results; the same batch size (16) is used for training and
# evaluation.
training_args = TrainingArguments(
 output_dir="./results",
 learning_rate=2e-5,
 per_device_train_batch_size=16,
 per_device_eval_batch_size=16,
 num_train_epochs=5,
 weight_decay=0.01,
)

# setup the trainer
# Trains on the "train" split and evaluates on the "valid" split; the "test"
# split is not passed in here. Uses the module-level `model`, `tokenizer`,
# `data_collator` and `compute_metrics` as they are bound when this cell runs.
trainer = Trainer(
 model=model,
 args=training_args,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["valid"],
 tokenizer=tokenizer,
 data_collator=data_collator,
 compute_metrics=compute_metrics,
)

# Train model and Evaluate

In [None]:
# train the model
trainer.train()

In [None]:
# evaluate on the test set
# should only do for _best_ model of each type 
# after selecting hyperparameters that work best on validation set
trainer.evaluate(tokenized_datasets["test"])

In [None]:
##!pip install huggingface_hub
#!sudo apt-get install git-lfs
#!huggingface-cli login
#!git clone https://huggingface.co/achyut/patronizing_detection
#cd /content/patronizing_detection

# LIME for Deep Learning Models

In [None]:
# LIME importing all the necessary libraries
import numpy as np
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Set the class names
class_names = ['non-patronizing','patronizing']

For LIME and other interpretability methods, we have to use the tokenizer and the model of the fine-tuned model, not the original pretrained Hugging Face checkpoint. That is because we want to explain the model with the trained weights, together with the tokenizer and vocabulary it was fine-tuned with.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("achyut/patronizing_detection")

model = AutoModelForSequenceClassification.from_pretrained("achyut/patronizing_detection")

In [None]:
# Move the model to the GPU when one is available; otherwise keep it on the
# CPU instead of failing on machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
!pip install more_itertools


# The function that calculates the class probabilities (softmax of the logits) for each sequence.

In [None]:
import more_itertools
def predictor4(texts, batch_size=64):
    """Return class probabilities for a list of raw strings.

    This is the classifier function handed to LIME's `explain_instance`: it
    takes a list of texts and returns one 2-D array with a row per text.
    It reads the module-level `tokenizer` and `model`.

    Args:
        texts: list of strings to classify.
        batch_size: number of texts tokenized and sent through the model in
            one forward pass.

    Returns:
        numpy array obtained by stacking, batch after batch, the softmax of
        the model logits taken over the last axis.
    """
    # send the inputs to wherever the model currently lives instead of
    # assuming a GPU
    device = model.device
    probas = []
    # inference only: no autograd graph is needed, which keeps memory flat
    # over the many perturbed samples LIME generates
    with torch.no_grad():
        for chunk in more_itertools.chunked(texts, batch_size):
            # truncation=True keeps over-long texts within the model's
            # maximum input length
            tokenized = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
            outputs = model(tokenized['input_ids'].to(device), tokenized['attention_mask'].to(device))
            # explicit dim: the implicit-dim form of softmax is deprecated
            probas.append(F.softmax(outputs.logits, dim=-1).cpu().numpy())
    return np.vstack(probas)

In [None]:
predictor4(["I have two dogs","The keep barking"])

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
str_to_predict = ds[6]['text']
exp = explainer.explain_instance(str_to_predict, predictor4, num_features= 25, num_samples = 2000)
exp.show_in_notebook(text=str_to_predict)

In [None]:
exp.as_list()

# classical Machine Learning

In [None]:
import collections
import csv
import os
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# We can use a seed if we want reproducibility
#np.random.seed(500)

In [None]:
import nltk
nltk.download('wordnet')
# nltk.word_tokenize and stopwords.words('english') are both used further
# down, but their resources are not downloaded in any of the cells visible
# here; fetch them so a fresh environment can run the notebook.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
# NOTE(review): names=[...] treats the first line of the file as data --
# confirm that patro_downsampled.csv has no header row.
Corpus = pd.read_csv("patro_downsampled.csv", names = ['text','label'])
# Drop rows with missing values *before* casting to str: astype(str) turns a
# missing value into the literal string "nan", which dropna() no longer sees.
# The index is rebuilt so that it runs 0..n-1 again after rows are removed.
Corpus = Corpus.dropna().reset_index(drop=True)
# change the text to str and lower case
Corpus['text'] = Corpus['text'].astype(str).str.lower()
Corpus.head()

In [None]:
Corpus.info()

In [None]:
# split every entry of the text column into a list of word tokens
Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)

# Lookup from the first letter of a POS tag to the WordNet part of speech:
# J -> adjective, V -> verb, R -> adverb; any other letter falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

In [None]:
# Look the stop-word list up once (as a set, for constant-time membership)
# instead of calling stopwords.words('english') for every single token.
english_stopwords = set(stopwords.words('english'))
# one lemmatizer shared by all rows
word_Lemmatized = WordNetLemmatizer()

# one processed string per row, in row order
processed_texts = []
for entry in Corpus['text']:
    Final_words = []
    for word, tag in pos_tag(entry):
        # check for Stop words and consider only alphabets
        if word not in english_stopwords and word.isalpha():
            # the first letter of the POS tag selects the WordNet POS
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    # the list is stored in its string form, e.g. "['word', 'word']"
    processed_texts.append(str(Final_words))
# Assign the whole column at once, by position. Writing row by row with
# Corpus.loc[i, ...] where i comes from enumerate() only hits the right rows
# when the index happens to be exactly 0..n-1.
Corpus['text_final'] = processed_texts

In [None]:
Corpus.head()

In [None]:
#Train, test split
# random_state pins the shuffle so that the same rows end up in the test set
# on every run and the scores of the models below stay comparable.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],
                                                                    Corpus['label'],
                                                                    test_size=0.2,
                                                                    random_state=500)

In [None]:
#Encoding our labels
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
# reuse the mapping learned from the training labels; refitting on the test
# labels could assign different integer codes to the classes
Test_Y = Encoder.transform(Test_Y)

# Vectorizer
Tfidf_vect = TfidfVectorizer()

# Fit on the training texts only, so that vocabulary and IDF weights carry no
# information from the held-out test texts.
Tfidf_vect.fit(Train_X)

In [None]:
# Transforming the train and test inputs into vectors
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
print(len(Tfidf_vect.vocabulary_))

# Fitting Models

## NaiveBayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF features
Naive = naive_bayes.MultinomialNB()
naive_model = Naive.fit(Train_X_Tfidf, Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
# share of test documents predicted correctly, as a percentage
nb_accuracy_pct = accuracy_score(Test_Y, predictions_NB) * 100
print("Naive Bayes Accuracy Score -> ", nb_accuracy_pct)

In [None]:
# sklearn metrics take (y_true, y_pred); with the two swapped, the printed
# precision is really the recall and vice versa.
# Prints F1, precision, recall.
print(f1_score(Test_Y, predictions_NB), precision_score(Test_Y, predictions_NB), recall_score(Test_Y, predictions_NB))

## SVM

In [None]:
# Support vector classifier with a degree-2 polynomial kernel
SVM = svm.SVC(C=2.0, kernel='poly', degree=2, gamma='scale')
svm_model = SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
# share of test documents predicted correctly, as a percentage
svm_accuracy_pct = accuracy_score(Test_Y, predictions_SVM) * 100
print("SVM Accuracy Score -> ", svm_accuracy_pct)

In [None]:
# sklearn metrics take (y_true, y_pred); with the two swapped, the printed
# precision is really the recall and vice versa.
# Prints F1, precision, recall.
print(f1_score(Test_Y, predictions_SVM), precision_score(Test_Y, predictions_SVM), recall_score(Test_Y, predictions_SVM))

In [None]:
# cross_val_score is never imported by name in this notebook; reach it through
# the already-imported sklearn.model_selection module.
# 5-fold cross-validation on the training split, scored with macro F1.
scores = model_selection.cross_val_score(SVM, Train_X_Tfidf, Train_Y, cv=5, scoring='f1_macro')
scores

In [None]:
# cross_val_score is never imported by name in this notebook; reach it through
# the already-imported sklearn.model_selection module.
# 10-fold cross-validation on the training split, scored with macro F1.
scores = model_selection.cross_val_score(SVM, Train_X_Tfidf, Train_Y, cv=10, scoring='f1_macro')
scores

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings on the TF-IDF features
logisticReg = LogisticRegression()
logisticReg.fit(Train_X_Tfidf, Train_Y)
predictions_LR = logisticReg.predict(Test_X_Tfidf)
# share of test documents predicted correctly, as a percentage
lr_accuracy_pct = accuracy_score(Test_Y, predictions_LR) * 100
print("LR Accuracy Score -> ", lr_accuracy_pct)

In [None]:
# sklearn metrics take (y_true, y_pred); with the two swapped, the printed
# precision is really the recall and vice versa.
# Prints F1, precision, recall.
print(f1_score(Test_Y, predictions_LR), precision_score(Test_Y, predictions_LR), recall_score(Test_Y, predictions_LR))

## RandomForest

In [None]:
# Apply random forest on the data
from sklearn.ensemble import RandomForestClassifier
randomForest = RandomForestClassifier(n_estimators = 50)
randomForest.fit(Train_X_Tfidf,Train_Y)
# predict with the forest that was just fitted (the logistic-regression model
# was being used here, so the "random forest" scores were really its scores)
predictions_RF = randomForest.predict(Test_X_Tfidf)
print("RF Accuracy Score -> ",accuracy_score(Test_Y, predictions_RF)*100)

In [None]:
# sklearn metrics take (y_true, y_pred); with the two swapped, the printed
# precision is really the recall and vice versa.
# Prints F1, precision, recall.
print(f1_score(Test_Y, predictions_RF), precision_score(Test_Y, predictions_RF), recall_score(Test_Y, predictions_RF))

# LIME for classical ML

In [None]:
import lime
import sklearn.ensemble
from __future__ import print_function
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

## Make the pipeline

In [None]:
# Chain the fitted TF-IDF vectorizer with the fitted logistic-regression model
# so that LIME can pass raw strings to `c.predict_proba`.
# (`logisticReg` is the name the classifier is bound to in this notebook;
# `logisticred_model` is not defined anywhere.)
c = make_pipeline(Tfidf_vect, logisticReg)
ls_X_test = list(Test_X)
# NOTE(review): this maps 0 -> 'patro', while the deep-learning section lists
# index 0 as 'non-patronizing' -- confirm which orientation matches the labels
# in patro_downsampled.csv.
class_names = {0: 'patro', 1:'non-patro'}
# LimeTextExplainer expects the names as a sequence ordered by class index, so
# hand it a list; the dict is kept because later cells call class_names.get().
LIME_explainer = LimeTextExplainer(class_names=[class_names[label] for label in sorted(class_names)])


In [None]:
idx = 15
LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)

In [None]:
print('Document id: %d' % idx)
print('Text: ', ls_X_test[idx])
print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])
print('True class: %s' % class_names.get(list(Test_Y)[idx]))

In [None]:
print("1 = non-Patro class, 0 = Patro class")
# show the explainability results with highlighted text
LIME_exp.show_in_notebook(text=True)

In [None]:
idx = 45
LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)
print('Document id: %d' % idx)
print('Text: ', ls_X_test[idx])
print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])
print('True class: %s' % class_names.get(list(Test_Y)[idx]))

In [None]:
print("1 = non-Patro class, 0 = Patro class")
# show the explainability results with highlighted text
LIME_exp.show_in_notebook(text=True)

# Topic Modeling

In [None]:
import pandas as pd
import numpy as np 
import re
from wordcloud import WordCloud
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis.gensim_models
import pickle
import pyLDAvis

In [None]:
df = pd.read_csv("dontpatronizeme.csv", names = ['Message','label'])

In [None]:
# Strip commas, periods, exclamation and question marks. A raw string is used
# for the pattern: '\.' inside an ordinary string literal is an invalid
# escape sequence, and a dot needs no escaping inside a character class.
df["Message_processed"] = df["Message"].map(lambda x: re.sub(r'[,.!?]', '', str(x)))
# lower-case everything
df['Message_processed'] = df['Message_processed'].map(lambda x: x.lower())
df['Message_processed'].head()

In [None]:
# Join all processed messages into one comma-separated string
long_string = ','.join(list(df['Message_processed'].values))
# Create a WordCloud object
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()

In [None]:
# NLTK English stop words plus a few extra tokens to ignore
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each sentence is cast to str and tokenized with gensim's
    `simple_preprocess`.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)


def remove_stopwords(texts):
    """Return, for every document in `texts`, its tokens minus the stop words.

    Each document is cast to str and passed through `simple_preprocess`
    again before filtering against the module-level `stop_words`.
    """
    # a set gives constant-time membership tests; it is built at call time
    # so that it reflects the current content of `stop_words`
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_set] for doc in texts]


data = df.Message_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)

In [None]:
print(data_words[:1][0][:30])

In [None]:
# Dictionary built from the tokenized documents
id2word = corpora.Dictionary(data_words)
texts = data_words
# Term Document Frequency: one bag-of-words per document
corpus = list(map(id2word.doc2bow, texts))
# View the first 30 entries of the first document
print(corpus[0][:30])

In [None]:
num_topics = 10
# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
)
# Print the keywords of the topics
pprint(lda_model.print_topics())
# apply the trained model to the corpus
doc_lda = lda_model[corpus]

In [None]:
pyLDAvis.enable_notebook()


In [None]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis
