#!/usr/bin/env python
# coding: utf-8
# In[6]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import nltk, re, string
from string import punctuation
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
# %matplotlib inline only works inside IPython/Jupyter; guard it so this file
# also runs as a plain Python script
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
# machine learning models
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
# Work around environments where NLTK's download endpoint fails SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# In[10]:
df = pd.read_csv('/home/user/disaster_tweets.csv')
df.head()
# In[11]:
df.info()
# ## Target Distribution
# In[12]:
sns.set_style("dark")
sns.countplot(x=df['target'])
# In[13]:
# creating a new column to store the length of each tweet
df['length'] = df['text'].apply(len)
df.head()
# In[14]:
df['length'].plot(bins=50, kind='hist')
# In[15]:
df.length.describe()
# In[16]:
df[df['length'] == 157]['text'].iloc[0]
# In[17]:
df.hist(column='length', by='target', bins=50,figsize=(10,4))
# In[18]:
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)
# Remove unnecessary stop words and punctuation from the tweets
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)
# use .copy() so the per-class slices are independent DataFrames
# (avoids pandas' SettingWithCopyWarning)
df_1 = df[df['target'] == 1].copy()
df_0 = df[df['target'] == 0].copy()
df_1['text'] = df_1['text'].apply(remove_stopwords)
df_0['text'] = df_0['text'].apply(remove_stopwords)
# ## Plotting wordcloud of Disaster Tweets
# In[21]:
from wordcloud import WordCloud
plt.figure(figsize = (20,20)) # Text that is Disaster tweets
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(" ".join(df_1.text))
plt.imshow(wc , interpolation = 'bilinear')
# ## Plotting wordcloud of Normal Tweets
# In[22]:
plt.figure(figsize = (20,20)) # Text that is Normal Tweets
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(" ".join(df_0.text))
plt.imshow(wc , interpolation = 'bilinear')
# ## Data Cleaning and Preparation
# In[23]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
# stopword list from the NLTK library
stop = stopwords.words('english')
def cleanTweet(txt):
    # lowercasing
    txt = txt.lower()
    # tokenization
    words = nltk.word_tokenize(txt)
    # removing stopwords & lemmatizing the words
    text = ' '.join([lemma.lemmatize(word) for word in words if word not in stop])
    # removing non-alphabetic characters
    txt = re.sub('[^a-z]', ' ', text)
    return txt
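# Illustrative check of cleanTweet (output is approximate; tokenization and
# lemmatization details depend on the installed NLTK data):
print(cleanTweet("Forest fires are spreading near the city!!"))
# e.g. -> 'forest fire spreading near city' (punctuation becomes whitespace)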
# ## Applying Clean Tweet Function on Tweets Text
# In[24]:
df['cleaned_tweets'] = df['text'].apply(cleanTweet)
df.head()
# ## Creating Feature & Target Variables
# In[25]:
y = df.target
X=df.cleaned_tweets
# In[26]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20,stratify=y, random_state=0)
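# Sanity check: stratify=y keeps the class ratio the same in both splits
# (exact proportions depend on the data):
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))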
# ## TF-IDF Vectorizer - Bi-Gram
# In[27]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)
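# Optional inspection of the fitted vectorizer (shapes and features depend on
# the data; get_feature_names_out assumes scikit-learn >= 1.0, as used below):
print(tfidf_train_2.shape)  # (n_train_tweets, n_unigram_and_bigram_features)
print(tfidf_vectorizer.get_feature_names_out()[:10])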
# ## Multinomial Naive Bayes
# In[28]:
## Model Fitting
mnb_tf = MultinomialNB()
mnb_tf.fit(tfidf_train_2, y_train)
# ## 10-Fold Cross Validation
# In[29]:
from sklearn import model_selection
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_mnb2 = cross_val_score(estimator = mnb_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)
acc_mnb2.mean()
# ## Model Prediction Test set
# In[30]:
pred_mnb2 = mnb_tf.predict(tfidf_test_2)
CM=confusion_matrix(y_test,pred_mnb2)
sns.heatmap(CM,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
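# Equivalent, more compact unpacking (used for the later models): sklearn lays
# out the binary confusion matrix as [[TN, FP], [FN, TP]], so
# TN, FP, FN, TP = CM.ravel() gives the same values.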
specificity = TN/(TN+FP)
acc= accuracy_score(y_test, pred_mnb2)
prec = precision_score(y_test, pred_mnb2)
rec = recall_score(y_test, pred_mnb2)
f1 = f1_score(y_test, pred_mnb2)
model_results = pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
                             columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
model_results
# ## Passive Aggressive Classifier
# In[31]:
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)
# ## 10-Fold Cross Validation
# In[32]:
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_pass2 = cross_val_score(estimator = pass_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)
acc_pass2.mean()
# ## Model Prediction
# In[33]:
pred_pass2 = pass_tf.predict(tfidf_test_2)
CM=confusion_matrix(y_test,pred_pass2)
sns.heatmap(CM,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])
TN, FP, FN, TP = CM.ravel()
specificity = TN / (TN + FP)  # recompute for this model (the original reused the MNB value)
acc = accuracy_score(y_test, pred_pass2)
prec = precision_score(y_test, pred_pass2)
rec = recall_score(y_test, pred_pass2)
f1 = f1_score(y_test, pred_pass2)
results = pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
                       columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
# DataFrame.append was removed in pandas 2.0; use pd.concat instead
results = pd.concat([model_results, results], ignore_index=True)
results
# ## TF-IDF Vectorizer - Tri Gram
# In[34]:
tfidf_vectorizer_3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))
tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)
# ## Multinomial Naive Bayes - Tri Gram
# In[35]:
mnb_tf3 = MultinomialNB()
mnb_tf3.fit(tfidf_train_3, y_train)
# ## 10-fold cross validation
# In[36]:
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_mnb3 = cross_val_score(estimator = mnb_tf3, X = tfidf_train_3, y = y_train, cv = kfold, scoring=scoring)
acc_mnb3.mean()
# ## Model Prediction
# In[37]:
pred_mnb3 = mnb_tf3.predict(tfidf_test_3)
CM=confusion_matrix(y_test,pred_mnb3)
sns.heatmap(CM,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])
TN, FP, FN, TP = CM.ravel()
specificity = TN / (TN + FP)
acc = accuracy_score(y_test, pred_mnb3)
prec = precision_score(y_test, pred_mnb3)
rec = recall_score(y_test, pred_mnb3)
f1 = f1_score(y_test, pred_mnb3)
mod_results = pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
                           columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
results = pd.concat([results, mod_results], ignore_index=True)
results
# ## Passive Aggressive Classifier - Tri Gram
# In[38]:
pass_tf3 = PassiveAggressiveClassifier()
pass_tf3.fit(tfidf_train_3, y_train)
## cross validation
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_pass3 = cross_val_score(estimator = pass_tf3, X = tfidf_train_3, y = y_train, cv = kfold,scoring=scoring)
acc_pass3.mean()
# In[39]:
pred_pass3 = pass_tf3.predict(tfidf_test_3)
CM=confusion_matrix(y_test,pred_pass3)
sns.heatmap(CM,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])
TN, FP, FN, TP = CM.ravel()
specificity = TN / (TN + FP)
acc = accuracy_score(y_test, pred_pass3)
prec = precision_score(y_test, pred_pass3)
rec = recall_score(y_test, pred_pass3)
f1 = f1_score(y_test, pred_pass3)
mod1_results = pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
                            columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
results = pd.concat([results, mod1_results], ignore_index=True)
results
# ## Most Informative Features
# In[40]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472
    Identify the most important features given a vectorizer and a binary
    classifier. Set n to the number of weighted features you would like to show.
    (Note: the current implementation merely prints and does not return the top
    features.)
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names_out()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print()
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)
# In[41]:
most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)
# In[42]:
most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)
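# Version note (an assumption about the installed scikit-learn): newer releases
# removed the coef_ attribute from MultinomialNB, so the call above may raise
# AttributeError; mnb_tf.feature_log_prob_ exposes the underlying per-class
# log probabilities instead.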
# ## Sample prediction
# In[43]:
sentences = [
    "Just happened a terrible car crash",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "No I don't like cold!",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?"
]
tfidf_trigram = tfidf_vectorizer_3.transform(sentences)
predictions = pass_tf3.predict(tfidf_trigram)
for text, label in zip(sentences, predictions):
    target = "Disaster Tweet" if label == 1 else "Normal Tweet"
    print("text:", text, "\nClass:", target)
    print()
# In[44]:
import gradio as gr
def sample_prediction(inputs):
    # hardcoded accuracy string displayed in the UI
    Accuracy = '97%'
    # Split the input text into separate sentences
    sentences = inputs.split('\n')
    tfidf_trigram = tfidf_vectorizer_3.transform(sentences)
    predictions = pass_tf3.predict(tfidf_trigram)
    results = ["Disaster Tweet" if prediction == 1 else "Normal Tweet" for prediction in predictions]
    return results, Accuracy
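# Quick local check of the helper before wiring it into Gradio (the input
# lines are illustrative):
print(sample_prediction("Just happened a terrible car crash\nNo I don't like cold!"))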
iface = gr.Interface(
    fn=sample_prediction,
    inputs=gr.Textbox(label="Enter sentences (one per line)", type="text"),
    outputs=[
        gr.Textbox(label="Results"),
        gr.Textbox(label="Accuracy")
    ],
    title="Tweet Classifier",
    description="Enter multiple sentences (separated by newlines) and get predictions."
)
iface.launch(share=True)