#!/usr/bin/env python
# coding: utf-8

# In[6]:

import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import nltk, re, string
from string import punctuation
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

# machine learning models
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# In[9]:

# Workaround for SSL certificate errors when fetching NLTK data.
# The corpora needed above are already downloaded, so the bare
# nltk.download() (which opens the interactive downloader and blocks
# a non-interactive run) is left commented out.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# nltk.download()
# In[10]:

df = pd.read_csv('/home/user/disaster_tweets.csv')
df.head()

# In[11]:

df.info()

# ## Target Distribution

# In[12]:

sns.set_style("dark")
sns.countplot(x='target', data=df)
# In[13]:

# creating a new column storing the length of each tweet
df['length'] = df['text'].apply(len)
df.head()

# In[14]:

df['length'].plot(bins=50, kind='hist')

# In[15]:

df.length.describe()

# In[16]:

df[df['length'] == 157]['text'].iloc[0]

# In[17]:

df.hist(column='length', by='target', bins=50, figsize=(10,4))
# In[18]:

stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

# Removing unnecessary stop words from the tweet text
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)

# .copy() avoids the pandas SettingWithCopyWarning on the assignments below
df_1 = df[df['target']==1].copy()
df_0 = df[df['target']==0].copy()
df_1['text'] = df_1['text'].apply(remove_stopwords)
df_0['text'] = df_0['text'].apply(remove_stopwords)
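# Quick sanity check of the stopword filter on a made-up sentence
# (illustrative input, not from the dataset):
remove_stopwords("There is a huge fire near the old bridge")
# -> 'huge fire near old bridge'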
# ## Plotting wordcloud of Disaster Tweets

# In[21]:

from wordcloud import WordCloud

# Text from disaster tweets
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=1000, width=1600, height=800).generate(" ".join(df_1.text))
plt.imshow(wc, interpolation='bilinear')

# ## Plotting wordcloud of Normal Tweets

# In[22]:

# Text from normal tweets
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=1000, width=1600, height=800).generate(" ".join(df_0.text))
plt.imshow(wc, interpolation='bilinear')
# ## Data Cleaning and Preparation

# In[23]:

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

# creating the list of English stopwords from the nltk library
stop = stopwords.words('english')

def cleanTweet(txt):
    # lowercasing
    txt = txt.lower()
    # tokenization
    words = nltk.word_tokenize(txt)
    # removing stopwords & lemmatizing the words
    text = ' '.join([lemma.lemmatize(word) for word in words if word not in stop])
    # removing non-alphabetic characters
    txt = re.sub('[^a-z]', ' ', text)
    return txt
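# Illustrative call on a made-up sentence (not from the dataset); the
# lemmatizer singularises nouns and the regex blanks out the digit:
cleanTweet("Rescuers saved 3 families from the floods")
# -> roughly 'rescuer saved   family flood'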
# ## Applying Clean Tweet Function on Tweets Text

# In[24]:

df['cleaned_tweets'] = df['text'].apply(cleanTweet)
df.head()

# ## Creating Feature & Target Variables

# In[25]:

y = df.target
X = df.cleaned_tweets

# In[26]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=0)
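# The stratified split should preserve the class ratio in both partitions;
# a quick check (exact proportions depend on the dataset):
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))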
# ## TF-IDF Vectorizer - Bi-Gram

# In[27]:

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)
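# A quick look at the learned uni/bigram feature space; exact counts depend
# on the data, and get_feature_names_out() requires scikit-learn >= 1.0:
print(tfidf_train_2.shape)                            # (n_train_tweets, n_features)
print(tfidf_vectorizer.get_feature_names_out()[:10])  # sample of the vocabulary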
# ## Multinomial Naive Bayes

# In[28]:

## Model Fitting
mnb_tf = MultinomialNB()
mnb_tf.fit(tfidf_train_2, y_train)

# ## 10-Fold Cross Validation

# In[29]:

from sklearn import model_selection
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_mnb2 = cross_val_score(estimator=mnb_tf, X=tfidf_train_2, y=y_train, cv=kfold, scoring=scoring)
acc_mnb2.mean()
# ## Model Prediction Test set

# In[30]:

pred_mnb2 = mnb_tf.predict(tfidf_test_2)
CM = confusion_matrix(y_test, pred_mnb2)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN / (TN + FP)
acc = accuracy_score(y_test, pred_mnb2)
prec = precision_score(y_test, pred_mnb2)
rec = recall_score(y_test, pred_mnb2)
f1 = f1_score(y_test, pred_mnb2)
model_results = pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
                             columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
model_results
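# The metric block above recurs for each model below; a small helper like
# this hypothetical evaluate_model() would consolidate it (a refactoring
# sketch, not part of the original notebook flow):
def evaluate_model(name, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()  # sklearn orders a binary matrix as TN, FP, FN, TP
    return pd.DataFrame([[name,
                          accuracy_score(y_true, y_pred),
                          precision_score(y_true, y_pred),
                          recall_score(y_true, y_pred),
                          tn / (tn + fp),  # specificity (true-negative rate)
                          f1_score(y_true, y_pred)]],
                        columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])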
# ## Passive Aggressive Classifier

# In[31]:

pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)

# ## 10-Fold Cross Validation

# In[32]:

kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_pass2 = cross_val_score(estimator=pass_tf, X=tfidf_train_2, y=y_train, cv=kfold, scoring=scoring)
acc_pass2.mean()

# ## Model Prediction

# In[33]:

pred_pass2 = pass_tf.predict(tfidf_test_2)
CM = confusion_matrix(y_test, pred_pass2)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# specificity (true-negative rate) from this model's confusion matrix
specificity = CM[0][0] / (CM[0][0] + CM[0][1])
acc = accuracy_score(y_test, pred_pass2)
prec = precision_score(y_test, pred_pass2)
rec = recall_score(y_test, pred_pass2)
f1 = f1_score(y_test, pred_pass2)
results = pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
                       columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
# pd.concat (DataFrame.append was removed in pandas 2.0)
results = pd.concat([model_results, results], ignore_index=True)
results
# ## TF-IDF Vectorizer - Tri Gram

# In[34]:

tfidf_vectorizer_3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))
tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)
# ## Multinomial Naive Bayes - Tri Gram

# In[35]:

mnb_tf3 = MultinomialNB()
mnb_tf3.fit(tfidf_train_3, y_train)

# ## 10-fold cross validation

# In[36]:

kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_mnb3 = cross_val_score(estimator=mnb_tf3, X=tfidf_train_3, y=y_train, cv=kfold, scoring=scoring)
acc_mnb3.mean()

# ## Model Prediction

# In[37]:

pred_mnb3 = mnb_tf3.predict(tfidf_test_3)
CM = confusion_matrix(y_test, pred_mnb3)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# specificity (true-negative rate) from this model's confusion matrix
specificity = CM[0][0] / (CM[0][0] + CM[0][1])
acc = accuracy_score(y_test, pred_mnb3)
prec = precision_score(y_test, pred_mnb3)
rec = recall_score(y_test, pred_mnb3)
f1 = f1_score(y_test, pred_mnb3)
mod_results = pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
                           columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
results = pd.concat([results, mod_results], ignore_index=True)
results
# ## Passive Aggressive Classifier - Tri Gram

# In[38]:

pass_tf3 = PassiveAggressiveClassifier()
pass_tf3.fit(tfidf_train_3, y_train)

## cross validation
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_pass3 = cross_val_score(estimator=pass_tf3, X=tfidf_train_3, y=y_train, cv=kfold, scoring=scoring)
acc_pass3.mean()

# In[39]:

pred_pass3 = pass_tf3.predict(tfidf_test_3)
CM = confusion_matrix(y_test, pred_pass3)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# specificity (true-negative rate) from this model's confusion matrix
specificity = CM[0][0] / (CM[0][0] + CM[0][1])
acc = accuracy_score(y_test, pred_pass3)
prec = precision_score(y_test, pred_pass3)
rec = recall_score(y_test, pred_pass3)
f1 = f1_score(y_test, pred_pass3)
mod1_results = pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
                            columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
results = pd.concat([results, mod1_results], ignore_index=True)
results
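# Optional: rank the four models by F1 for easier comparison (a small
# convenience view, not part of the original notebook flow):
results.sort_values('F1 Score', ascending=False)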
# ## Most Informative Features

# In[40]:

def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472

    Identify most important features if given a vectorizer and binary classifier. Set n to the number
    of weighted features you would like to show. (Note: current implementation merely prints and does not
    return top classes.)
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names_out()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print()
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)
# In[41]:

most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)

# In[42]:

most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)
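# Caveat: MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed
# in later releases, so the call above fails there; the same ranking can be
# read from the fitted log probabilities (a sketch for the binary case):
weights = mnb_tf.feature_log_prob_[1] - mnb_tf.feature_log_prob_[0]  # log-odds per feature
feature_names = tfidf_vectorizer.get_feature_names_out()
print(sorted(zip(weights, feature_names), reverse=True)[:10])  # most disaster-leaning n-grams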
# ## Sample prediction

# In[43]:

sentences = [
    "Just happened a terrible car crash",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "No I don't like cold!",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?"
]
tfidf_trigram = tfidf_vectorizer_3.transform(sentences)
predictions = pass_tf3.predict(tfidf_trigram)

for text, label in zip(sentences, predictions):
    target = "Disaster Tweet" if label == 1 else "Normal Tweet"
    print("text:", text, "\nClass:", target)
    print()
# In[61]:

import gradio as gr

def sample_prediction(inputs):
    # hardcoded headline figure shown in the UI, not a computed score
    Accuracy = '97%'
    # Split the input text into separate sentences
    sentences = inputs.split('\n')
    tfidf_trigram = tfidf_vectorizer_3.transform(sentences)
    predictions = pass_tf3.predict(tfidf_trigram)
    results = [" Disaster Tweet " if prediction == 1 else " Normal Tweet " for prediction in predictions]
    return results, Accuracy

iface = gr.Interface(
    fn=sample_prediction,
    inputs=gr.Textbox(label="Enter Sentences (separate by newline)", type="text"),
    outputs=[
        gr.Textbox(label="Results"),
        gr.Textbox(label="Accuracy")
    ],
    title="Tweet Classifier",
    description="Enter multiple sentences (separate by newline) and get predictions."
)
iface.launch(share=True)