{ "cells": [ { "cell_type": "markdown", "id": "060994f2", "metadata": {}, "source": [ "# importing the necessary libraries" ] }, { "cell_type": "code", "execution_count": null, "id": "033ebd27", "metadata": {}, "outputs": [], "source": [ "# imports - native Python\n", "import collections\n", "import csv\n", "import os\n", "import re\n", "# imports - 3rd party\n", "from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n", "# installs from 🤗\n", "! pip install transformers\n", "! pip install datasets\n", "from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "from datasets import Dataset, DatasetDict" ] }, { "cell_type": "code", "execution_count": null, "id": "0214c70f", "metadata": {}, "outputs": [], "source": [ "import torch\n", "torch.cuda.empty_cache()" ] }, { "cell_type": "markdown", "id": "13732b06", "metadata": {}, "source": [ "# Loading the data" ] }, { "cell_type": "code", "execution_count": null, "id": "e5a782b3", "metadata": {}, "outputs": [], "source": [ "# Using csv instead of pandas for sanity and to do filtering while loading\n", "\n", "# make parallel lists of texts and labels\n", "# texts: strings containing messages\n", "dataset_dict = {'text':[], 'label':[]}\n", "for f in os.listdir():\n", " # use all .tsv files that have been loaded\n", " if f.endswith('dontpatronizeme.tsv'):\n", " with open(f) as tsv_file:\n", " reader = csv.DictReader(tsv_file, dialect='excel-tab')\n", " for line in reader:\n", " text = line['text']\n", " # a few of the Message fields are empty, so we should skip those ones\n", " if text!=None and text.strip()!=\"\":\n", " dataset_dict['text'].append(text)\n", " dataset_dict['label'].append(int(line['label']))\n", "# huggingface function to convert from dict to their Dataset object\n", "# which will work nicely with their model trainer\n", "ds = Dataset.from_dict(dataset_dict)" ] }, { "cell_type": "markdown", "id": "52379811", "metadata": {}, "source": [ "# Creating train, valid, test splits" ] }, { "cell_type": "code", "execution_count": null, "id": "a6f69bc1", "metadata": {}, "outputs": [], "source": [ "# no function to split into train/validation/test so we do 2 separate splits\n", "# first split 80-20 into train and test+validation\n", "train_testvalid = ds.train_test_split(test_size=0.2)\n", "# then split the 20 into 10-10 validation and test\n", "test_valid = train_testvalid['test'].train_test_split(test_size=0.5)\n", "# finally, make the full dataset the 80-10-10 split as a DatasetDict object\n", "train_test_valid_dataset = DatasetDict({\n", " 'train': train_testvalid['train'],\n", " 'test': test_valid['test'],\n", " 'valid': test_valid['train']})\n", "# quick check (if this doesn't pass, will get an error in the tokenization)\n", "# makes sure we filtered the data correcly at the beginning and removed None\n", "for split in train_test_valid_dataset.keys():\n", " assert not any([x==None for x in train_test_valid_dataset[split]['text']])" ] }, { "cell_type": "markdown", "id": "0dfcc029", "metadata": {}, "source": [ "# Tokenizer" ] }, { "cell_type": "markdown", "id": "b2cb0082", "metadata": {}, "source": [ "This is the tokenizer for the distilbert model" ] }, { "cell_type": "code", "execution_count": null, "id": "65a26dc2", "metadata": {}, "outputs": [], "source": [ "# just use the default tokenizer for the model\n", "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n", "\n", "# simple wrapper\n", "def tokenize(examples, 
textfield=\"text\"):\n", " return tokenizer(examples[textfield], padding=\"max_length\", truncation=True)\n", "\n", "# batch tokenization\n", "tokenized_datasets = train_test_valid_dataset.map(tokenize, batched=True)" ] }, { "cell_type": "markdown", "id": "38a15ebb", "metadata": {}, "source": [ "Below are the examples for also the RoBERTa model and the BERT model" ] }, { "cell_type": "code", "execution_count": null, "id": "8f45cf1d", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", "\n", "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\")" ] }, { "cell_type": "code", "execution_count": null, "id": "79d33a06", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n", "\n", "model = AutoModelForMaskedLM.from_pretrained(\"roberta-base\")" ] }, { "cell_type": "markdown", "id": "9b550e83", "metadata": {}, "source": [ "# Model " ] }, { "cell_type": "code", "execution_count": null, "id": "12c960c0", "metadata": {}, "outputs": [], "source": [ "# Setup collation\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n", "\n", "# Load model\n", "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)" ] }, { "cell_type": "markdown", "id": "d4342956", "metadata": {}, "source": [ "# Computing the metrics and training args" ] }, { "cell_type": "code", "execution_count": null, "id": "4c974458", "metadata": {}, "outputs": [], "source": [ "# using sklearn to compute precision, recall, f1, and accuracy\n", "def compute_metrics(pred):\n", " labels = pred.label_ids\n", " preds = pred.predictions.argmax(-1)\n", " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')\n", " acc = accuracy_score(labels, preds)\n", " return {\n", " 'accuracy': acc,\n", " 'f1': f1,\n", " 'precision': precision,\n", " 'recall': recall\n", " }" ] }, { "cell_type": "code", "execution_count": null, "id": "8c4fb414", "metadata": {}, "outputs": [], "source": [ "# Set training args (just using defaults from the following tutorial for now:\n", "# https://huggingface.co/docs/transformers/training )\n", "training_args = TrainingArguments(\n", " output_dir=\"./results\",\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=16,\n", " per_device_eval_batch_size=16,\n", " num_train_epochs=5,\n", " weight_decay=0.01,\n", ")\n", "\n", "# setup the trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_datasets[\"train\"],\n", " eval_dataset=tokenized_datasets[\"valid\"],\n", " tokenizer=tokenizer,\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", ")" ] }, { "cell_type": "markdown", "id": "cb346507", "metadata": {}, "source": [ "# Train model and Evaluate" ] }, { "cell_type": "code", "execution_count": null, "id": "de170b1e", "metadata": {}, "outputs": [], "source": [ "# train the model\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "id": "48adbaed", "metadata": {}, "outputs": [], "source": [ "# evaluate on the test set\n", "# should only do for _best_ model of each type \n", "# after selecting hyperparameters that work best on validation set\n", "trainer.evaluate(tokenized_datasets[\"test\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "c3dea644", 
"metadata": {}, "outputs": [], "source": [ "##!pip install huggingface_hub\n", "#!sudo apt-get install fit-lfs\n", "#!huggingface-cli login\n", "#!git clone https://huggingface.co/achyut/patronizing_detection\n", "#cd /content/patronizing_detection" ] }, { "cell_type": "markdown", "id": "539c8683", "metadata": {}, "source": [ "# LIME for Deep Learning Models" ] }, { "cell_type": "code", "execution_count": null, "id": "9f7c2cab", "metadata": {}, "outputs": [], "source": [ "# LIME importing all the necessary libraries\n", "import numpy as np\n", "import lime\n", "import torch\n", "import torch.nn.functional as F\n", "from lime.lime_text import LimeTextExplainer\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification" ] }, { "cell_type": "code", "execution_count": null, "id": "d53f4b7d", "metadata": {}, "outputs": [], "source": [ "# Set the class names\n", "class_names = ['non-patronizing','patronizing']" ] }, { "cell_type": "markdown", "id": "2d91f290", "metadata": {}, "source": [ "For LIME and other interpretable AI models, we Have to use the tokenizer and the model of the fine-tuned pretrained model. Not the Huggingface un fine tuned model. That is because we want to use the model with the trained weights, tokens and vocab" ] }, { "cell_type": "code", "execution_count": null, "id": "e2381d7b", "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(\"achyut/patronizing_detection\")\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(\"achyut/patronizing_detection\")" ] }, { "cell_type": "code", "execution_count": null, "id": "318859d6", "metadata": {}, "outputs": [], "source": [ "model.cuda()" ] }, { "cell_type": "code", "execution_count": null, "id": "99a7e69f", "metadata": {}, "outputs": [], "source": [ "!pip install more_itertools\n" ] }, { "cell_type": "markdown", "id": "c810588c", "metadata": {}, "source": [ "# The function that calculates the logits for each sequence. 
" ] }, { "cell_type": "code", "execution_count": null, "id": "c3db6441", "metadata": {}, "outputs": [], "source": [ "import more_itertools\n", "def predictor4(texts, batch_size=64):\n", " probas = []\n", " for chunk in more_itertools.chunked(texts, batch_size):\n", " tokenized = tokenizer(chunk, return_tensors=\"pt\", padding=True)\n", " outputs = model(tokenized['input_ids'].to('cuda'), tokenized['attention_mask'].to('cuda'))\n", " probas.append(F.softmax(outputs.logits).cpu().detach().numpy())\n", " return np.vstack(probas)" ] }, { "cell_type": "code", "execution_count": null, "id": "1074572d", "metadata": {}, "outputs": [], "source": [ "predictor4([\"I have two dogs\",\"The keep barking\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "661d8281", "metadata": {}, "outputs": [], "source": [ "explainer = LimeTextExplainer(class_names=class_names)" ] }, { "cell_type": "code", "execution_count": null, "id": "abb9b201", "metadata": {}, "outputs": [], "source": [ "str_to_predict = ds[6]['text']\n", "exp = explainer.explain_instance(str_to_predict, predictor4, num_features= 25, num_samples = 2000)\n", "exp.show_in_notebook(text=str_to_predict)" ] }, { "cell_type": "code", "execution_count": null, "id": "1885619b", "metadata": {}, "outputs": [], "source": [ "exp.as_list()" ] }, { "cell_type": "code", "execution_count": null, "id": "5f004287", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "42dfbb84", "metadata": {}, "source": [ "# classical Machine Learning" ] }, { "cell_type": "code", "execution_count": null, "id": "94835013", "metadata": {}, "outputs": [], "source": [ "import collections\n", "import csv\n", "import os\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "from nltk.tokenize import word_tokenize\n", "from sklearn.preprocessing import LabelEncoder\n", "from collections import defaultdict\n", "from nltk.corpus import wordnet as wn\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import model_selection, naive_bayes, svm\n", "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n", "from nltk import pos_tag\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer" ] }, { "cell_type": "code", "execution_count": null, "id": "8605ed57", "metadata": {}, "outputs": [], "source": [ "# We can use a seed if we want reproducibility\n", "#np.random.seed(500)" ] }, { "cell_type": "code", "execution_count": null, "id": "5475808d", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": null, "id": "c3745eee", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('averaged_perceptron_tagger')" ] }, { "cell_type": "code", "execution_count": null, "id": "180f42bf", "metadata": {}, "outputs": [], "source": [ "Corpus = pd.read_csv(\"patro_downsampled.csv\", names = ['text','label'])\n", "# change it to str, lower case and drop the na values\n", "Corpus.text = Corpus.text.astype(str)\n", "Corpus['text'] = Corpus['text'].str.lower()\n", "Corpus = Corpus.dropna()\n", "Corpus.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "5f9d00c8", "metadata": {}, "outputs": [], "source": [ "Corpus.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "659d463e", "metadata": {}, "outputs": [], "source": [ "#tokenizing our para text column here\n", "Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)\n", "\n", "# Tagging to 
{ "cell_type": "markdown", "id": "42dfbb84", "metadata": {}, "source": [ "# Classical Machine Learning" ] },
{ "cell_type": "code", "execution_count": null, "id": "94835013", "metadata": {}, "outputs": [], "source": [ "import collections\n", "import csv\n", "import os\n", "import re\n", "import pandas as pd\n", "import numpy as np\n", "from nltk.tokenize import word_tokenize\n", "from sklearn.preprocessing import LabelEncoder\n", "from collections import defaultdict\n", "from nltk.corpus import wordnet as wn\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import model_selection, naive_bayes, svm\n", "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score\n", "from nltk import pos_tag\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer" ] },
{ "cell_type": "code", "execution_count": null, "id": "8605ed57", "metadata": {}, "outputs": [], "source": [ "# We can use a seed if we want reproducibility\n", "#np.random.seed(500)" ] },
{ "cell_type": "code", "execution_count": null, "id": "5475808d", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('wordnet')\n", "# punkt and stopwords are also needed, by word_tokenize and the stop-word filter below\n", "nltk.download('punkt')\n", "nltk.download('stopwords')" ] },
{ "cell_type": "code", "execution_count": null, "id": "c3745eee", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('averaged_perceptron_tagger')" ] },
{ "cell_type": "code", "execution_count": null, "id": "180f42bf", "metadata": {}, "outputs": [], "source": [ "Corpus = pd.read_csv(\"patro_downsampled.csv\", names = ['text','label'])\n", "# change it to str, lower case and drop the na values\n", "Corpus.text = Corpus.text.astype(str)\n", "Corpus['text'] = Corpus['text'].str.lower()\n", "Corpus = Corpus.dropna()\n", "Corpus.head()" ] },
{ "cell_type": "code", "execution_count": null, "id": "5f9d00c8", "metadata": {}, "outputs": [], "source": [ "Corpus.info()" ] },
{ "cell_type": "code", "execution_count": null, "id": "659d463e", "metadata": {}, "outputs": [], "source": [ "# tokenizing the text column here\n", "Corpus['text'] = Corpus['text'].apply(nltk.word_tokenize)\n", "\n", "# tagging to understand whether each word is a noun, verb, adverb, etc.\n", "tag_map = defaultdict(lambda : wn.NOUN)\n", "tag_map['J'] = wn.ADJ\n", "tag_map['V'] = wn.VERB\n", "tag_map['R'] = wn.ADV" ] },
{ "cell_type": "code", "execution_count": null, "id": "5af9ea94", "metadata": {}, "outputs": [], "source": [ "# instantiate the lemmatizer once, outside the loop\n", "word_Lemmatized = WordNetLemmatizer()\n", "for index,entry in enumerate(Corpus['text']):\n", " # empty list which we will append to the df at the end\n", " Final_words = []\n", " for word, tag in pos_tag(entry):\n", " # check for stop words and consider only alphabetic tokens\n", " if word not in stopwords.words('english') and word.isalpha():\n", " word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])\n", " Final_words.append(word_Final)\n", " # the final processed set of words for each row is stored in 'text_final'\n", " Corpus.loc[index,'text_final'] = str(Final_words)" ] },
{ "cell_type": "code", "execution_count": null, "id": "8c6d9bc6", "metadata": {}, "outputs": [], "source": [ "Corpus.head()" ] },
{ "cell_type": "code", "execution_count": null, "id": "f654c4ab", "metadata": {}, "outputs": [], "source": [ "# Train, test split\n", "Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],\n", " Corpus['label'],\n", " test_size=0.2)" ] },
{ "cell_type": "code", "execution_count": null, "id": "00747dbd", "metadata": {}, "outputs": [], "source": [ "# Encoding our labels\n", "Encoder = LabelEncoder()\n", "Train_Y = Encoder.fit_transform(Train_Y)\n", "# use transform (not fit_transform) on the test labels so the mapping matches\n", "Test_Y = Encoder.transform(Test_Y)\n", "\n", "# Vectorizer\n", "Tfidf_vect = TfidfVectorizer()\n", "\n", "Tfidf_vect.fit(Corpus['text_final'])" ] },
{ "cell_type": "code", "execution_count": null, "id": "95b89126", "metadata": {}, "outputs": [], "source": [ "# Transforming the train and test inputs into vectors\n", "Train_X_Tfidf = Tfidf_vect.transform(Train_X)\n", "Test_X_Tfidf = Tfidf_vect.transform(Test_X)\n", "print(len(Tfidf_vect.vocabulary_))" ] },
{ "cell_type": "markdown", "id": "1da1f215", "metadata": {}, "source": [ "# Fitting Models" ] },
{ "cell_type": "markdown", "id": "b8d618cd", "metadata": {}, "source": [ "## Naive Bayes" ] },
{ "cell_type": "code", "execution_count": null, "id": "7613821b", "metadata": {}, "outputs": [], "source": [ "# fit the NB classifier\n", "Naive = naive_bayes.MultinomialNB()\n", "naive_model = Naive.fit(Train_X_Tfidf,Train_Y)\n", "predictions_NB = Naive.predict(Test_X_Tfidf)\n", "print(\"Naive Bayes Accuracy Score -> \",accuracy_score(Test_Y, predictions_NB)*100)" ] },
{ "cell_type": "code", "execution_count": null, "id": "d04b0813", "metadata": {}, "outputs": [], "source": [ "# sklearn metrics expect (y_true, y_pred) in that order\n", "print(f1_score(Test_Y, predictions_NB), precision_score(Test_Y, predictions_NB), recall_score(Test_Y, predictions_NB))" ] },
{ "cell_type": "markdown", "id": "539cb258", "metadata": {}, "source": [ "## SVM" ] },
{ "cell_type": "code", "execution_count": null, "id": "cf9ebed3", "metadata": {}, "outputs": [], "source": [ "# SVM classifier\n", "SVM = svm.SVC(C=2.0, kernel='poly', degree=2, gamma='scale')\n", "svm_model = SVM.fit(Train_X_Tfidf,Train_Y)\n", "predictions_SVM = SVM.predict(Test_X_Tfidf)\n", "print(\"SVM Accuracy Score -> \",accuracy_score(Test_Y, predictions_SVM)*100)" ] },
{ "cell_type": "code", "execution_count": null, "id": "1fbf3e41", "metadata": {}, "outputs": [], "source": [ "print(f1_score(Test_Y, predictions_SVM), precision_score(Test_Y, predictions_SVM), recall_score(Test_Y, predictions_SVM))" ] },
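{ "cell_type": "markdown", "id": "ef56ab78", "metadata": {}, "source": [ "A sketch (not in the original notebook): `classification_report` prints the same per-class precision/recall/F1 in one call, which is handy when comparing the classifiers below." ] },
{ "cell_type": "code", "execution_count": null, "id": "fa67bc89", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import classification_report\n", "# sketch: per-class precision/recall/F1 for the SVM in one call\n", "print(classification_report(Test_Y, predictions_SVM))" ] },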
"metadata": {}, "outputs": [], "source": [ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 5 , scoring = 'f1_macro')\n", "scores" ] }, { "cell_type": "code", "execution_count": null, "id": "c5a07117", "metadata": {}, "outputs": [], "source": [ "scores = cross_val_score(SVM,Train_X_Tfidf,Train_Y, cv = 10 , scoring = 'f1_macro')\n", "scores" ] }, { "cell_type": "markdown", "id": "a4dea60f", "metadata": {}, "source": [ "## Logistic Regression" ] }, { "cell_type": "code", "execution_count": null, "id": "7c96b88d", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "logisticReg = LogisticRegression()\n", "logisticReg.fit(Train_X_Tfidf,Train_Y)\n", "predictions_LR = logisticReg.predict(Test_X_Tfidf)\n", "print(\"LR Accuracy Score -> \",accuracy_score(predictions_LR, Test_Y)*100)" ] }, { "cell_type": "code", "execution_count": null, "id": "47750ca0", "metadata": {}, "outputs": [], "source": [ "print(f1_score(predictions_LR, Test_Y), precision_score(predictions_LR, Test_Y),recall_score(predictions_LR, Test_Y))" ] }, { "cell_type": "markdown", "id": "75efc6b3", "metadata": {}, "source": [ "## RandomForest" ] }, { "cell_type": "code", "execution_count": null, "id": "144104e6", "metadata": {}, "outputs": [], "source": [ "# Apply random forest on the data\n", "from sklearn.ensemble import RandomForestClassifier\n", "randomForest = RandomForestClassifier(n_estimators = 50) \n", "randomForest.fit(Train_X_Tfidf,Train_Y)\n", "predictions_RF = logisticReg.predict(Test_X_Tfidf)\n", "print(\"LR Accuracy Score -> \",accuracy_score(predictions_RF, Test_Y)*100)" ] }, { "cell_type": "code", "execution_count": null, "id": "1f083f5e", "metadata": {}, "outputs": [], "source": [ "print(f1_score(predictions_RF, Test_Y),precision_score(predictions_RF, Test_Y),recall_score(predictions_RF, Test_Y))" ] }, { "cell_type": "markdown", "id": "03fb7cc8", "metadata": {}, "source": [ "# LIME for classical ML" ] }, { "cell_type": "code", "execution_count": null, "id": "41fa18be", "metadata": {}, "outputs": [], "source": [ "import lime\n", "import sklearn.ensemble\n", "from __future__ import print_function\n", "from lime import lime_text\n", "from sklearn.pipeline import make_pipeline\n", "from lime.lime_text import LimeTextExplainer" ] }, { "cell_type": "markdown", "id": "d952eb5d", "metadata": {}, "source": [ "## Make the pipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "f96a244e", "metadata": {}, "outputs": [], "source": [ "c = make_pipeline(Tfidf_vect, logisticred_model)\n", "ls_X_test= list(Test_X)\n", "class_names = {0: 'patro', 1:'non-patro'}\n", "LIME_explainer = LimeTextExplainer(class_names=class_names)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c0a727a1", "metadata": {}, "outputs": [], "source": [ "idx = 15\n", "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)" ] }, { "cell_type": "code", "execution_count": null, "id": "b1755fc8", "metadata": {}, "outputs": [], "source": [ "print('Document id: %d' % idx)\n", "print('Text: ', ls_X_test[idx])\n", "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n", "print('True class: %s' % class_names.get(list(Test_Y)[idx]))" ] }, { "cell_type": "code", "execution_count": null, "id": "78b0d22e", "metadata": {}, "outputs": [], "source": [ "print(\"1 = non-Patro class, 0 = Patro class\")\n", "# show the explainability results with highlighted text\n", "LIME_exp.show_in_notebook(text=True)" ] }, { "cell_type": "code", "execution_count": null, 
"id": "e3e16b80", "metadata": {}, "outputs": [], "source": [ "idx = 45\n", "LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)\n", "print('Document id: %d' % idx)\n", "print('Text: ', ls_X_test[idx])\n", "print('Probability =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])\n", "print('True class: %s' % class_names.get(list(Test_Y)[idx]))" ] }, { "cell_type": "code", "execution_count": null, "id": "bd8e838a", "metadata": {}, "outputs": [], "source": [ "print(\"1 = non-Patro class, 0 = Patro class\")\n", "# show the explainability results with highlighted text\n", "LIME_exp.show_in_notebook(text=True)" ] }, { "cell_type": "markdown", "id": "f8f07e74", "metadata": {}, "source": [ "# Topic Modeling" ] }, { "cell_type": "code", "execution_count": null, "id": "2825b328", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np \n", "import re\n", "from wordcloud import WordCloud\n", "import gensim\n", "from gensim.utils import simple_preprocess\n", "from nltk.corpus import stopwords\n", "import gensim.corpora as corpora\n", "from pprint import pprint\n", "import pyLDAvis.gensim_models\n", "import pickle\n", "import pyLDAvis" ] }, { "cell_type": "code", "execution_count": null, "id": "71ab6908", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"dontpatronizeme.csv\", names = ['Message','label'])" ] }, { "cell_type": "code", "execution_count": null, "id": "0c4a0602", "metadata": {}, "outputs": [], "source": [ "df[\"Message_processed\"] = df[\"Message\"].map(lambda x: re.sub('[,\\.!?]', '', str(x)))\n", "df['Message_processed'] = df['Message_processed'].map(lambda x: x.lower())\n", "df['Message_processed'].head()" ] }, { "cell_type": "code", "execution_count": null, "id": "0e507f49", "metadata": {}, "outputs": [], "source": [ "long_string = ','.join(list(df['Message_processed'].values))# Create a WordCloud object\n", "wordcloud = WordCloud(background_color=\"white\", max_words=5000, contour_width=3, contour_color='steelblue')# Generate a word cloud\n", "wordcloud.generate(long_string)# Visualize the word cloud\n", "wordcloud.to_image()" ] }, { "cell_type": "code", "execution_count": null, "id": "76a3f280", "metadata": {}, "outputs": [], "source": [ "stop_words = stopwords.words('english')\n", "stop_words.extend(['from', 'subject', 're', 'edu', 'use'])\n", "def sent_to_words(sentences):\n", " for sentence in sentences:\n", " # deacc=True removes punctuations\n", " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n", " \n", "def remove_stopwords(texts):\n", " return [[word for word in simple_preprocess(str(doc)) \n", " if word not in stop_words] for doc in texts]\n", "data = df.Message_processed.values.tolist()\n", "data_words = list(sent_to_words(data))# remove stop words\n", "data_words = remove_stopwords(data_words)" ] }, { "cell_type": "code", "execution_count": null, "id": "1e257cc3", "metadata": {}, "outputs": [], "source": [ "print(data_words[:1][0][:30])" ] }, { "cell_type": "code", "execution_count": null, "id": "98c5203f", "metadata": {}, "outputs": [], "source": [ "id2word = corpora.Dictionary(data_words)\n", "texts = data_words# Term Document Frequency\n", "corpus = [id2word.doc2bow(text) for text in texts]# View\n", "print(corpus[:1][0][:30])" ] }, { "cell_type": "code", "execution_count": null, "id": "b4a35025", "metadata": {}, "outputs": [], "source": [ "num_topics = 10# Build LDA model\n", "lda_model = gensim.models.LdaMulticore(corpus=corpus,\n", " id2word=id2word,\n", " num_topics=num_topics)\n", 
"# Print the Keyword in the 10 topics\n", "pprint(lda_model.print_topics())\n", "doc_lda = lda_model[corpus]" ] }, { "cell_type": "code", "execution_count": null, "id": "00346a62", "metadata": {}, "outputs": [], "source": [ "pyLDAvis.enable_notebook()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f6f7889b", "metadata": {}, "outputs": [], "source": [ "vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds=\"mmds\", R=30)\n", "vis\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e4b7ca16", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1b214796", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e7f8e54c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "021b015f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ab1a9490", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0da95a15", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "22c069c0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c02c30f3", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9cdde3ad", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "717270ef", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "25a8f105", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }