File size: 3,361 Bytes
cc400c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d713b3d
cc400c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding: utf-8 -*-
"""First_Text_Classification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sdLss09e3OxYVoeK3oBA6qrUSj_iOxp-

<h3 align = "center">Importing Libraries</h3>
"""

import numpy as np
import pandas as pd

"""<h3 align = "center">Importing Dataset</h3>"""

data = pd.read_csv("spam.csv", encoding = "ISO-8859-1")

"""<h3 align = "center">Preliminary Data Checks</h3>"""

data.head()

data.isnull().sum()

data.shape

data['v1'].value_counts()

data.info()

"""<h3 align = "center">Putting the Length of Characters of each row in a column.</h3>"""

data["Unnamed: 2"] = data["v2"].str.len()

"""<h3 align = "center">Visualising Length of Characters for each category!</h3>"""


"""<h5>It is evident from the above plot that spam texts are usually longer in length!</h5>

<h3 align = "center">Defining Variables</h3>
"""

X = data["v2"]
y = data["v1"]

"""<h3 align = "center">Train Test Split</h3>"""

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

"""<h3 align = "center">Vecrorizing Words into Matrix</h3>"""

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from the training messages and map
# each message to a sparse vector of token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Bare expressions are no-ops in a .py script, so print the inspection
# results explicitly.
print(repr(X_train_counts))  # sparse-matrix summary
print(X_train.shape)         # number of training messages
print(X_train_counts.shape)  # (messages, vocabulary size)

from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts by TF-IDF so very common words count for
# less. NOTE(review): this two-step path is superseded below by a
# single TfidfVectorizer that does both stages at once.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print(X_train_tfidf.shape)

"""<h3 align = "center">Using TDIF Vectorizer for optimum vectorization!</h3>"""

from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer = CountVectorizer + TfidfTransformer in one step.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Print the shape; the bare expression was a no-op in a script.
print(X_train_tfidf.shape)

"""<h3 align = "center">Creating Model</h3>"""

from sklearn.svm import LinearSVC

# Linear SVM — a fast, strong baseline for sparse TF-IDF text features.
# NOTE(review): this clf is never reused below; the Pipeline section
# fits an independent LinearSVC of its own.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

"""<h3 align = "center">Creating Pipeline</h3>"""

from sklearn.pipeline import Pipeline

text_clf = Pipeline([("tfidf",TfidfVectorizer()),("clf",LinearSVC())])

text_clf.fit(X_train,y_train)

predictions = text_clf.predict(X_test)

X_test

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))

"""<h3 align = "center">Accuracy Score</h3>"""

print(accuracy_score(y_test,predictions))

"""<h3 align = "center">Predictions </h3>"""

text_clf.predict(["Hi how are you doing today?"])

text_clf.predict(["Congratulations! You are selected for a free vouchar worth $500"])

"""<h3 align = "center">Creating User Interface!</h3>"""

import gradio as gr

def first_nlp_spam_detector(text):
  """Classify one message with the trained pipeline.

  Args:
      text: the raw message string to classify.

  Returns:
      A human-readable verdict string: legitimate ("ham") or spam.
  """
  # predict expects an iterable of documents, so wrap the single text
  # in a list literal (the original shadowed the builtin `list` and
  # appended manually).
  label = text_clf.predict([text])[0]
  if label == 'ham':
    return "Your Text is a Legitimate One!"
  return "Beware of such text messages, It's a Spam! "

# Gradio UI: a two-line textbox feeds first_nlp_spam_detector and the
# returned verdict string is displayed in a Label component.
interface = gr.Interface(first_nlp_spam_detector,inputs = gr.Textbox(lines=2, placeholder="Enter your Text Here.....!", show_label = False),
                         outputs = gr.Label(value = "Predicting the Text Classification..!"),description = "Predicting Text Legitimacy!")

# Smoke-test the classifier function. NOTE(review): the return value is
# discarded here — wrap in print() if the verdict should be shown.
first_nlp_spam_detector("Congratulations! You are selected for a free vouchar worth $500")

# Start the Gradio web server (blocks in a script; serves a local UI).
interface.launch()