# -*- coding: utf-8 -*-
"""First_Text_Classification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sdLss09e3OxYVoeK3oBA6qrUSj_iOxp-

<h3 align = "center">Importing Libraries</h3>
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
"""<h3 align = "center">Importing Dataset</h3>""" | |
data = pd.read_csv("/content/spam.csv", encoding = "ISO-8859-1") | |
"""<h3 align = "center">Preliminary Data Checks</h3>""" | |
data.head() | |
data.isnull().sum() | |
data.shape | |
data['v1'].value_counts() | |
data.info() | |
"""<h3 align = "center">Putting the Length of Characters of each row in a column.</h3>""" | |
data["Unnamed: 2"] = data["v2"].str.len() | |
"""<h3 align = "center">Visualising Length of Characters for each category!</h3>""" | |
plt.figure(figsize = (12,8)) | |
sns.displot(data = data ,x = "Unnamed: 2", hue = "v1",log_scale = True) | |
"""<h5>It is evident from the above plot that spam texts are usually longer in length!</h5> | |
<h3 align = "center">Defining Variables</h3> | |
""" | |
X = data["v2"] | |
y = data["v1"] | |
"""<h3 align = "center">Train Test Split</h3>""" | |
from sklearn.model_selection import train_test_split | |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) | |
"""<h3 align = "center">Vecrorizing Words into Matrix</h3>""" | |
from sklearn.feature_extraction.text import CountVectorizer | |
count_vect = CountVectorizer() | |
X_train_counts = count_vect.fit_transform(X_train) | |
X_train_counts | |
X_train.shape | |
X_train_counts.shape | |
from sklearn.feature_extraction.text import TfidfTransformer | |
tfidf_transformer = TfidfTransformer() | |
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) | |
X_train_tfidf.shape | |
"""<h3 align = "center">Using TDIF Vectorizer for optimum vectorization!</h3>""" | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
vectorizer = TfidfVectorizer() | |
X_train_tfidf = vectorizer.fit_transform(X_train) | |
X_train_tfidf.shape | |
"""<h3 align = "center">Creating Model</h3>""" | |
from sklearn.svm import LinearSVC | |
clf = LinearSVC() | |
clf.fit(X_train_tfidf,y_train) | |
"""<h3 align = "center">Creating Pipeline</h3>""" | |
from sklearn.pipeline import Pipeline | |
text_clf = Pipeline([("tfidf",TfidfVectorizer()),("clf",LinearSVC())]) | |
text_clf.fit(X_train,y_train) | |
predictions = text_clf.predict(X_test) | |
X_test | |
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score | |
print(confusion_matrix(y_test,predictions)) | |
print(classification_report(y_test,predictions)) | |
"""<h3 align = "center">Accuracy Score</h3>""" | |
print(accuracy_score(y_test,predictions)) | |
"""<h3 align = "center">Predictions </h3>""" | |
text_clf.predict(["Hi how are you doing today?"]) | |
text_clf.predict(["Congratulations! You are selected for a free vouchar worth $500"]) | |
"""<h3 align = "center">Creating User Interface!</h3>""" | |
! pip install gradio | |
import gradio as gr | |
def first_nlp_spam_detector(text):
    """Classify a single message with the global `text_clf` pipeline.

    Parameters
    ----------
    text : str
        Raw message text to classify.

    Returns
    -------
    str
        A human-readable verdict: legitimate ("ham") or a spam warning.
    """
    # Pipeline.predict expects an iterable of documents; a one-element list
    # literal avoids the original's append loop and its shadowing of the
    # `list` builtin.
    prediction = text_clf.predict([text])
    if prediction[0] == 'ham':
        return "Your Text is a Legitimate One!"
    else:
        return "Beware of such text messages, It\'s a Spam! "
# Wire the classifier function into a simple Gradio web UI: a two-line
# textbox in, a label out.
interface = gr.Interface(
    first_nlp_spam_detector,
    inputs=gr.Textbox(lines=2, placeholder="Enter your Text Here.....!", show_label=False),
    outputs=gr.Label(value="Predicting the Text Classification..!"),
    description="Predicting Text Legitimacy!",
)
# Smoke-test the handler once before serving it.
first_nlp_spam_detector("Congratulations! You are selected for a free vouchar worth $500")
interface.launch()