Spaces:

Felixogunwale
/

coverylanguagedetection1

Runtime error

App Files Files Community

coverylanguagedetection1 / language_detection.py

Felixogunwale

Upload 18 files

a63752b over 1 year ago

raw

history blame contribute delete

No virus

2.16 kB

	#import all the required libraries.
	import pandas as pd
	import numpy as np
	import re
	import seaborn as sns
	import matplotlib.pyplot as plt
	import warnings
	import csv
	warnings.simplefilter("ignore")
	import sklearn


	# Module-level training pipeline: loads the labelled corpus, fits a
	# bag-of-words Multinomial Naive Bayes classifier and reports its accuracy.
	# The fitted objects `cv`, `model` and `le` stay at module level because
	# `predict` reads them.
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.preprocessing import LabelEncoder

	# Loading the dataset; the "Text" and "Language" columns are read below.
	data = pd.read_csv("Language_Detection.csv")

	# separating the independent and dependent features
	X = data["Text"]
	y = data["Language"]

	# converting the language names to integer class ids; `le` is kept so that
	# predictions can be mapped back to names with inverse_transform
	le = LabelEncoder()
	y = le.fit_transform(y)

	# Symbols, digits, newlines and square brackets are each replaced by a space.
	# The newline must be written as the escape `\n`: a bare `n` inside the
	# character class deletes every letter "n" from the text. The brackets are
	# escaped because the pattern `[[]]` only matches the literal sequence "[]".
	_NOISE_PATTERN = re.compile(r'[!@#$(),\n"%^*?:;~`0-9\[\]]')

	# preprocessed text: noise characters blanked out, then lower-cased
	data_list = [_NOISE_PATTERN.sub(' ', text).lower() for text in X]

	# creating bag of words using CountVectorizer (dense array of token counts)
	cv = CountVectorizer()
	X = cv.fit_transform(data_list).toarray()

	# train test splitting: 20% of the rows are held out for evaluation
	x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

	# model creation and fitting
	model = MultinomialNB()
	model.fit(x_train, y_train)

	# prediction on the held-out rows
	y_pred = model.predict(x_test)

	# model evaluation
	ac = accuracy_score(y_test, y_pred)
	cm = confusion_matrix(y_test, y_pred)

	print("Accuracy is :", ac)

	# visualising the confusion matrix
	plt.figure(figsize=(15, 10))
	sns.heatmap(cm, annot=True)
	plt.show()

	# function for predicting language
	def predict(text):
	    """Print and return the predicted language of ``text``.

	    The text is vectorized with the module-level ``cv``, classified by
	    ``model``, and the resulting class id is mapped back to its label
	    with ``le``.

	    Args:
	        text: A single string to classify.

	    Returns:
	        The predicted label for ``text``. It is returned as well as
	        printed so that callers can use the result without capturing
	        stdout.
	    """
	    # transform() expects an iterable of documents, hence the one-item list.
	    features = cv.transform([text]).toarray()
	    class_ids = model.predict(features)
	    # Exactly one sample was passed in, so the first entry is its label.
	    label = le.inverse_transform(class_ids)[0]
	    print("The langauge is in", label)
	    return label

	# Example usage (English input):
	# predict("load the texts")

	#with open('processed/text_detected/language_detected.txt', 'w', newline="") as file:
	# csv.writer(file, delimiter=" ").writerows(prediction)