# Language_Identifier/data_cleaning.py
from data_analysis import df
from nltk.tokenize import word_tokenize
import re
import pandas as pd
import nltk
# Removing duplicates
# df = df.drop_duplicates(subset='Text')
# df = df.reset_index(drop=True)
nltk.download('punkt')
# Punctuation and other special characters to strip from the token stream
# (a set gives O(1) membership checks in the filter below)
nonalphanumeric = {'\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
                   '/', '>', '<', '|', ' '}
def clean_text(text):
    """
    Clean and preprocess a single text string.
    """
    # Tokenize the text using NLTK (not spaCy)
    tokens = word_tokenize(text)
    # Lowercase each token and drop punctuation/special-character tokens
    words = [word.lower() for word in tokens if word not in nonalphanumeric]
    # Join the cleaned tokens back into a single string
    cleaned_text = " ".join(words)
    return cleaned_text
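
# Quick sanity check (illustrative only; word_tokenize splits punctuation into
# separate tokens, which the filter above then drops):
#   clean_text("Hello, World!")  ->  "hello world"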
def remove_english(text):
    """
    Return the input text with English (Latin-alphabet) words removed.
    """
    pat = "[a-zA-Z]+"
    text = re.sub(pat, "", text)
    return text
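
# Quick sanity check (illustrative only; the regex strips runs of Latin
# letters but leaves whitespace and non-Latin characters untouched):
#   remove_english("你好 hello 世界")  ->  "你好  世界"  (note the doubled space)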
# Applying clean_text to every row of the 'Text' column
# df['clean_text'] = df['Text'].apply(clean_text)
# # Removing English words from the Chinese rows
# df_Chinese = df[df['language'] == 'Chinese'].copy()  # Chinese subset of the dataset
# df_Chinese['clean_text'] = df_Chinese['clean_text'].apply(remove_english)
# # Drop the original (uncleaned) Chinese rows first, then append the cleaned
# # ones; concatenating before dropping by language would discard the cleaned
# # rows too, since they also carry language == 'Chinese'
# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
# # Shuffle the dataframe and reset the index
# df = df.sample(frac=1).reset_index(drop=True)
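
if __name__ == "__main__":
    # Minimal smoke test on toy data (a hedged sketch: the two rows below are
    # made up for illustration; the commented pipeline above operates on the
    # real df imported from data_analysis)
    demo = pd.DataFrame({
        'language': ['Chinese', 'English'],
        'Text': ['你好, hello world!', 'Good morning, friends!'],
    })
    demo['clean_text'] = demo['Text'].apply(clean_text)
    zh = demo['language'] == 'Chinese'
    demo.loc[zh, 'clean_text'] = demo.loc[zh, 'clean_text'].apply(remove_english)
    print(demo[['language', 'clean_text']])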