#!/usr/bin/env python
# coding: utf-8

# In[6]:

import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import nltk, re, string
from string import punctuation
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

# machine learning models
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# In[9]:

# Workaround for SSL certificate errors when fetching NLTK data.
# The corpora needed above are already downloaded, so the bare
# nltk.download() (which opens the interactive downloader and blocks
# a non-interactive run) is left commented out.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# nltk.download()
# In[10]:

df = pd.read_csv('/home/user/disaster_tweets.csv')
df.head()

# In[11]:

df.info()

# ## Target Distribution

# In[12]:

sns.set_style("dark")
sns.countplot(x='target', data=df)
# In[13]:

# creating a new column storing the length of each tweet
df['length'] = df['text'].apply(len)
df.head()

# In[14]:

df['length'].plot(bins=50, kind='hist')

# In[15]:

df.length.describe()

# In[16]:

df[df['length'] == 157]['text'].iloc[0]

# In[17]:

df.hist(column='length', by='target', bins=50, figsize=(10,4))
# In[18]:

stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

# Removing unnecessary stop words from the tweet text
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)

# .copy() avoids the pandas SettingWithCopyWarning on the assignments below
df_1 = df[df['target']==1].copy()
df_0 = df[df['target']==0].copy()
df_1['text'] = df_1['text'].apply(remove_stopwords)
df_0['text'] = df_0['text'].apply(remove_stopwords)
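# Quick sanity check of the stopword filter on a made-up sentence
# (illustrative input, not from the dataset):
remove_stopwords("There is a huge fire near the old bridge")
# -> 'huge fire near old bridge'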
# ## Plotting wordcloud of Disaster Tweets

# In[21]:

from wordcloud import WordCloud

# Text from disaster tweets
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=1000, width=1600, height=800).generate(" ".join(df_1.text))
plt.imshow(wc, interpolation='bilinear')

# ## Plotting wordcloud of Normal Tweets

# In[22]:

# Text from normal tweets
plt.figure(figsize=(20,20))
wc = WordCloud(max_words=1000, width=1600, height=800).generate(" ".join(df_0.text))
plt.imshow(wc, interpolation='bilinear')
# ## Data Cleaning and Preparation

# In[23]:

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

# creating the list of English stopwords from the nltk library
stop = stopwords.words('english')

def cleanTweet(txt):
    # lowercasing
    txt = txt.lower()
    # tokenization
    words = nltk.word_tokenize(txt)
    # removing stopwords & lemmatizing the words
    text = ' '.join([lemma.lemmatize(word) for word in words if word not in stop])
    # removing non-alphabetic characters
    txt = re.sub('[^a-z]', ' ', text)
    return txt
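# Illustrative call on a made-up sentence (not from the dataset); the
# lemmatizer singularises nouns and the regex blanks out the digit:
cleanTweet("Rescuers saved 3 families from the floods")
# -> roughly 'rescuer saved   family flood'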
# ## Applying Clean Tweet Function on Tweets Text

# In[24]:

df['cleaned_tweets'] = df['text'].apply(cleanTweet)
df.head()

# ## Creating Feature & Target Variables

# In[25]:

y = df.target
X = df.cleaned_tweets

# In[26]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=0)
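# The stratified split should preserve the class ratio in both partitions;
# a quick check (exact proportions depend on the dataset):
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))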
# ## TF-IDF Vectorizer - Bi-Gram

# In[27]:

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)
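# A quick look at the learned uni/bigram feature space; exact counts depend
# on the data, and get_feature_names_out() requires scikit-learn >= 1.0:
print(tfidf_train_2.shape)                            # (n_train_tweets, n_features)
print(tfidf_vectorizer.get_feature_names_out()[:10])  # sample of the vocabulary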
# ## Multinomial Naive Bayes

# In[28]:

## Model Fitting
mnb_tf = MultinomialNB()
mnb_tf.fit(tfidf_train_2, y_train)

# ## 10-Fold Cross Validation

# In[29]:

from sklearn import model_selection
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_mnb2 = cross_val_score(estimator=mnb_tf, X=tfidf_train_2, y=y_train, cv=kfold, scoring=scoring)
acc_mnb2.mean()
# ## Model Prediction Test set

# In[30]:

pred_mnb2 = mnb_tf.predict(tfidf_test_2)
CM = confusion_matrix(y_test, pred_mnb2)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN / (TN + FP)
acc = accuracy_score(y_test, pred_mnb2)
prec = precision_score(y_test, pred_mnb2)
rec = recall_score(y_test, pred_mnb2)
f1 = f1_score(y_test, pred_mnb2)
model_results = pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
                             columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
model_results
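# The metric block above recurs for each model below; a small helper like
# this hypothetical evaluate_model() would consolidate it (a refactoring
# sketch, not part of the original notebook flow):
def evaluate_model(name, y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()  # sklearn orders a binary matrix as TN, FP, FN, TP
    return pd.DataFrame([[name,
                          accuracy_score(y_true, y_pred),
                          precision_score(y_true, y_pred),
                          recall_score(y_true, y_pred),
                          tn / (tn + fp),  # specificity (true-negative rate)
                          f1_score(y_true, y_pred)]],
                        columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])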
# ## Passive Aggressive Classifier

# In[31]:

pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)

# ## 10-Fold Cross Validation

# In[32]:

kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_pass2 = cross_val_score(estimator=pass_tf, X=tfidf_train_2, y=y_train, cv=kfold, scoring=scoring)
acc_pass2.mean()

# ## Model Prediction

# In[33]:

pred_pass2 = pass_tf.predict(tfidf_test_2)
CM = confusion_matrix(y_test, pred_pass2)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# specificity (true-negative rate) from this model's confusion matrix
specificity = CM[0][0] / (CM[0][0] + CM[0][1])
acc = accuracy_score(y_test, pred_pass2)
prec = precision_score(y_test, pred_pass2)
rec = recall_score(y_test, pred_pass2)
f1 = f1_score(y_test, pred_pass2)
results = pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
                       columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
# pd.concat (DataFrame.append was removed in pandas 2.0)
results = pd.concat([model_results, results], ignore_index=True)
results
# ## TF-IDF Vectorizer - Tri Gram

# In[34]:

tfidf_vectorizer_3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))
tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)
# ## Multinomial Naive Bayes - Tri Gram

# In[35]:

mnb_tf3 = MultinomialNB()
mnb_tf3.fit(tfidf_train_3, y_train)

# ## 10-fold cross validation

# In[36]:

kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_mnb3 = cross_val_score(estimator=mnb_tf3, X=tfidf_train_3, y=y_train, cv=kfold, scoring=scoring)
acc_mnb3.mean()

# ## Model Prediction

# In[37]:

pred_mnb3 = mnb_tf3.predict(tfidf_test_3)
CM = confusion_matrix(y_test, pred_mnb3)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# specificity (true-negative rate) from this model's confusion matrix
specificity = CM[0][0] / (CM[0][0] + CM[0][1])
acc = accuracy_score(y_test, pred_mnb3)
prec = precision_score(y_test, pred_mnb3)
rec = recall_score(y_test, pred_mnb3)
f1 = f1_score(y_test, pred_mnb3)
mod_results = pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
                           columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
results = pd.concat([results, mod_results], ignore_index=True)
results
# ## Passive Aggressive Classifier - Tri Gram

# In[38]:

pass_tf3 = PassiveAggressiveClassifier()
pass_tf3.fit(tfidf_train_3, y_train)

## cross validation
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
acc_pass3 = cross_val_score(estimator=pass_tf3, X=tfidf_train_3, y=y_train, cv=kfold, scoring=scoring)
acc_pass3.mean()

# In[39]:

pred_pass3 = pass_tf3.predict(tfidf_test_3)
CM = confusion_matrix(y_test, pred_pass3)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidths=1, annot=True, fmt='d',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# specificity (true-negative rate) from this model's confusion matrix
specificity = CM[0][0] / (CM[0][0] + CM[0][1])
acc = accuracy_score(y_test, pred_pass3)
prec = precision_score(y_test, pred_pass3)
rec = recall_score(y_test, pred_pass3)
f1 = f1_score(y_test, pred_pass3)
mod1_results = pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
                            columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'])
results = pd.concat([results, mod1_results], ignore_index=True)
results
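# Optional: rank the four models by F1 for easier comparison (a small
# convenience view, not part of the original notebook flow):
results.sort_values('F1 Score', ascending=False)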
# ## Most Informative Features

# In[40]:

def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472

    Identify most important features if given a vectorizer and binary classifier. Set n to the number
    of weighted features you would like to show. (Note: current implementation merely prints and does not
    return top classes.)
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names_out()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print()
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)
# In[41]:

most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)

# In[42]:

most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)
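# Caveat: MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed
# in later releases, so the call above fails there; the same ranking can be
# read from the fitted log probabilities (a sketch for the binary case):
weights = mnb_tf.feature_log_prob_[1] - mnb_tf.feature_log_prob_[0]  # log-odds per feature
feature_names = tfidf_vectorizer.get_feature_names_out()
print(sorted(zip(weights, feature_names), reverse=True)[:10])  # most disaster-leaning n-grams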
# ## Sample prediction

# In[43]:

sentences = [
    "Just happened a terrible car crash",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "No I don't like cold!",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?"
]
tfidf_trigram = tfidf_vectorizer_3.transform(sentences)
predictions = pass_tf3.predict(tfidf_trigram)

for text, label in zip(sentences, predictions):
    target = "Disaster Tweet" if label == 1 else "Normal Tweet"
    print("text:", text, "\nClass:", target)
    print()
# In[61]:

import gradio as gr

def sample_prediction(inputs):
    # hardcoded headline figure shown in the UI, not a computed score
    Accuracy = '97%'
    # Split the input text into separate sentences
    sentences = inputs.split('\n')
    tfidf_trigram = tfidf_vectorizer_3.transform(sentences)
    predictions = pass_tf3.predict(tfidf_trigram)
    results = [" Disaster Tweet " if prediction == 1 else " Normal Tweet " for prediction in predictions]
    return results, Accuracy

iface = gr.Interface(
    fn=sample_prediction,
    inputs=gr.Textbox(label="Enter Sentences (separate by newline)", type="text"),
    outputs=[
        gr.Textbox(label="Results"),
        gr.Textbox(label="Accuracy")
    ],
    title="Tweet Classifier",
    description="Enter multiple sentences (separate by newline) and get predictions."
)
iface.launch(share=True)