coverylanguagedetection1 / language_detection.py
Felixogunwale's picture
Upload 18 files
a63752b
raw
history blame contribute delete
No virus
2.16 kB
#import all the required libraries.
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import csv
warnings.simplefilter("ignore")
import sklearn
# Read the labelled corpus; the code below uses its "Text" and "Language" columns.
data = pd.read_csv("Language_Detection.csv")
# Per-language sample counts (the result is only displayed in an interactive
# session; as a script statement it is computed and discarded).
data["Language"].value_counts()
# Independent feature (raw text) and dependent feature (language name).
X, y = data["Text"], data["Language"]
# Map each language name to an integer class id.  The fitted encoder is kept
# in `le` so predictions can be decoded back to language names later.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(y)
y = le.transform(y)
# Characters blanked out before vectorising: punctuation symbols, newlines,
# digits and square brackets.
# NOTE: the class previously contained a bare `n` (a lost `\n` escape), which
# deleted every letter "n" from the training text, and the bracket pattern
# `[[]]` only matched the literal pair "[]".  Both are corrected here, and the
# pattern is compiled once because it is applied to every row.
_NOISE_CHARS = re.compile(r'[!@#$(),\n"%^*?:;~`0-9\[\]]')


def _clean_text(text):
    """Return *text* lower-cased with noise characters replaced by spaces.

    Args:
        text: one raw text sample (a string).

    Returns:
        The cleaned string; every character matched by ``_NOISE_CHARS`` is
        replaced by a single space, then the result is lower-cased.
    """
    return _NOISE_CHARS.sub(' ', text).lower()


# Preprocessed copy of every text in X, in the same order as X.
data_list = [_clean_text(text) for text in X]
# creating bag of words using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# Keep the document-term matrix sparse.  Calling .toarray() here would
# allocate a dense (documents x vocabulary) array that is almost entirely
# zeros; train_test_split and MultinomialNB both accept sparse input, so the
# dense copy only costs memory.
X = cv.fit_transform(data_list)
# (documents, vocabulary size) - only displayed in an interactive session
X.shape
# Hold out 20% of the samples for evaluation.
# NOTE: no random_state is passed, so the split differs from run to run.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fit a multinomial naive Bayes classifier on the training portion.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(x_train, y_train)

# Predicted class ids for the held-out samples.
y_pred = model.predict(x_test)
# Score the held-out predictions against the true labels.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print("Accuracy is :", ac)

# Show the confusion matrix as an annotated heat map.
plt.figure(figsize=(15, 10))
sns.heatmap(cm, annot=True)
plt.show()
# function for predicting language
def predict(text):
    """Predict, print and return the language of *text*.

    The text is vectorised with the fitted module-level ``cv``, classified by
    ``model``, and the predicted class id is decoded to its language name
    with ``le``.

    Args:
        text: a single text sample (one string, not a list of strings).

    Returns:
        The decoded language label for *text*.  The same label is also
        printed, as before; returning it lets callers store or write the
        result instead of only seeing it on stdout.
    """
    features = cv.transform([text]).toarray()
    class_id = model.predict(features)
    language = le.inverse_transform(class_id)[0]
    # Message typo fixed: "langauge" -> "language".
    print("The language is in", language)
    return language
# English
#predict("load the texts")
#with open('processed/text_detected/language_detected.txt', 'w', newline="") as file:
# csv.writer(file, delimiter=" ").writerows(prediction)