# -*- coding: utf-8 -*-
"""First_Text_Classification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sdLss09e3OxYVoeK3oBA6qrUSj_iOxp-

<h3 align = "center">Importing Libraries</h3>
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
"""<h3 align = "center">Importing Dataset</h3>""" | |
data = pd.read_csv("/content/spam.csv", encoding = "ISO-8859-1") | |
"""<h3 align = "center">Preliminary Data Checks</h3>""" | |
data.head() | |
data.isnull().sum() | |
data.shape | |
data['v1'].value_counts() | |
data.info() | |
"""<h3 align = "center">Putting the Length of Characters of each row in a column.</h3>""" | |
data["Unnamed: 2"] = data["v2"].str.len() | |
"""<h3 align = "center">Visualising Length of Characters for each category!</h3>""" | |
plt.figure(figsize = (12,8)) | |
sns.displot(data = data ,x = "Unnamed: 2", hue = "v1",log_scale = True) | |
"""<h5>It is evident from the above plot that spam texts are usually longer in length!</h5> | |
<h3 align = "center">Defining Variables</h3> | |
""" | |
X = data["v2"] | |
y = data["v1"] | |
"""<h3 align = "center">Train Test Split</h3>""" | |
from sklearn.model_selection import train_test_split | |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) | |
"""<h3 align = "center">Vecrorizing Words into Matrix</h3>""" | |
from sklearn.feature_extraction.text import CountVectorizer | |
count_vect = CountVectorizer() | |
X_train_counts = count_vect.fit_transform(X_train) | |
X_train_counts | |
X_train.shape | |
X_train_counts.shape | |
from sklearn.feature_extraction.text import TfidfTransformer | |
tfidf_transformer = TfidfTransformer() | |
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) | |
X_train_tfidf.shape | |
"""<h3 align = "center">Using TDIF Vectorizer for optimum vectorization!</h3>""" | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
vectorizer = TfidfVectorizer() | |
X_train_tfidf = vectorizer.fit_transform(X_train) | |
X_train_tfidf.shape | |
"""<h3 align = "center">Creating Model</h3>""" | |
from sklearn.svm import LinearSVC | |
clf = LinearSVC() | |
clf.fit(X_train_tfidf,y_train) | |
"""<h3 align = "center">Creating Pipeline</h3>""" | |
from sklearn.pipeline import Pipeline | |
text_clf = Pipeline([("tfidf",TfidfVectorizer()),("clf",LinearSVC())]) | |
text_clf.fit(X_train,y_train) | |
predictions = text_clf.predict(X_test) | |
X_test | |
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score | |
print(confusion_matrix(y_test,predictions)) | |
print(classification_report(y_test,predictions)) | |
"""<h3 align = "center">Accuracy Score</h3>""" | |
print(accuracy_score(y_test,predictions)) | |
"""<h3 align = "center">Predictions </h3>""" | |
text_clf.predict(["Hi how are you doing today?"]) | |
text_clf.predict(["Congratulations! You are selected for a free vouchar worth $500"]) | |
"""<h3 align = "center">Creating User Interface!</h3>""" | |
! pip install gradio | |
import gradio as gr | |
def first_nlp_spam_detector(text):
    """Classify a single message with the global `text_clf` pipeline.

    Parameters
    ----------
    text : str
        Raw message text to classify.

    Returns
    -------
    str
        A human-readable verdict: legitimate ("ham") or a spam warning.
    """
    # Pipeline.predict expects an iterable of documents; a one-element list
    # literal avoids the original's append loop and its shadowing of the
    # `list` builtin.
    prediction = text_clf.predict([text])
    if prediction[0] == 'ham':
        return "Your Text is a Legitimate One!"
    else:
        return "Beware of such text messages, It\'s a Spam! "
# Wire the classifier function into a simple Gradio web UI: a two-line
# textbox in, a label out.
interface = gr.Interface(
    first_nlp_spam_detector,
    inputs=gr.Textbox(lines=2, placeholder="Enter your Text Here.....!", show_label=False),
    outputs=gr.Label(value="Predicting the Text Classification..!"),
    description="Predicting Text Legitimacy!",
)
# Smoke-test the handler once before serving it.
first_nlp_spam_detector("Congratulations! You are selected for a free vouchar worth $500")
interface.launch()