# preprocess_function.py — mapping_bert_topic_copy
# Text-preprocessing helpers for BERT topic modelling (commit c140769).
import pandas as pd
import re
from nltk import ngrams
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# One-time NLTK resource downloads needed by the tokenizer/lemmatizer below.
# (The original called nltk.download('stopwords') twice; once is enough.)
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Base English stop words, extended with event/exhibition-domain filler words
# that carry no topical signal for this corpus.
stop_words = set(stopwords.words('english'))
stop_words_2 = ('show', 'international', 'exhibition', 'trade', 'fair', 'global',
                'conference', 'world', 'expo', 'event', 'wellknown', 'popular',
                'new', 'together', 'latest', 'offer', 'trend', 'sector',
                'exhibitor', 'th', 'one', 'like', 'also', 'held', 'well', 'etc',
                'u', 'bb', 'provide', 'provides', 'day', 'attendee', 'year',
                'best', 'top', 'management', 'brings', 'bring', 'topic',
                'visitor', 'buyer', 'brand', 'take', 'national', 'great', 'come')
stop_words = stop_words.union(stop_words_2)

# Collect place names (countries/capitals/regions, states, cities, zones) so
# location words can be stripped before topic modelling.
# NOTE(review): assumes countries.csv/states.csv/cities.csv/zones.csv sit in
# the working directory — confirm deployment layout.
list_location = []
for col in ['name', 'capital', 'region', 'subregion']:  # columns of countries.csv
    list_location.extend(list(set(pd.read_csv('countries.csv')[col])))
list_location.extend(list(set(pd.read_csv('states.csv')['name'])))
list_location.extend(list(set(pd.read_csv('cities.csv')['name'])))
list_location.extend(list(set(pd.read_csv('zones.csv')['Zone'])))
locations_removal = set([x.lower() for x in list_location if not pd.isna(x)])
locations_removal.discard('nan')  # literal 'nan' strings slip past pd.isna

# stop_words already contains stop_words_2 (unioned above), so a single
# union with the location names suffices.
stop_words_bert = stop_words.union(locations_removal)
def preprocess_text(keyword):
    """Normalise a raw keyword string for topic modelling.

    Steps: remove stop words, strip punctuation and non-ASCII characters,
    lowercase, lemmatise, de-duplicate words, and drop degenerate tokens
    containing long runs of a repeated letter.

    Parameters
    ----------
    keyword : str
        Raw keyword/phrase text.

    Returns
    -------
    str
        Space-joined processed words, in first-occurrence order.
    """
    # First pass of stop-word removal on the raw (mixed-case) text.
    keyword = ' '.join([w for w in word_tokenize(keyword) if not w.lower() in stop_words])
    keyword = keyword.replace('/', ' ')
    # Trim non-alphanumeric runs at either end (a closing ')' may stay at the end).
    keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword).strip()
    keyword = keyword.replace('_', ' ')
    keyword = keyword.replace('&', ' ').strip()
    # Drop non-ASCII characters, then lowercase.
    keyword = keyword.encode('ascii', 'ignore').decode('utf-8').strip().lower()
    # Keep only letters and whitespace.
    keyword = re.sub(r'[^a-zA-Z\s]', '', keyword)
    words = word_tokenize(keyword)
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    # BUG FIX: the original de-duplicated with list(set(...)), whose iteration
    # order depends on per-run string-hash randomisation — the output word
    # order was non-deterministic across runs.  dict.fromkeys de-duplicates
    # while preserving first-occurrence order.
    words = list(dict.fromkeys(lemmatizer.lemmatize(word) for word in words))
    # Lemmatisation can produce new stop words (e.g. plural -> singular),
    # so filter a second time.
    words = [word for word in words if word not in stop_words]
    processed_text = ' '.join(words)
    # Remove degenerate tokens with 11+ consecutive repeats of one letter.
    processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', processed_text)
    return processed_text
def bert_preprocess(keyword):
    """Clean a keyword string for BERT topic modelling.

    Drops all-caps abbreviations, lowercases, removes stop words and
    location names, strips non-letter characters, lemmatises, and
    collapses whitespace.

    Parameters
    ----------
    keyword : str
        Raw keyword/phrase text.

    Returns
    -------
    str
        Single-spaced, lemmatised, lowercase text.
    """
    # Strip abbreviations such as "U.S." or "AI" before lowercasing.
    lowered = re.sub(r"\b[A-Z\.]{2,}\b", ' ', keyword).lower()
    # Keep only tokens whose punctuation-free form is not a stop word
    # or a known location name.
    kept = []
    for token in word_tokenize(lowered):
        bare = re.sub(r'[^\w\s]', '', token.lower())
        if bare not in stop_words_bert:
            kept.append(token)
    text = ' '.join(kept)
    # Trim non-alphanumeric runs at the ends, then blank out any
    # remaining non-letter characters.
    text = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Lemmatise every remaining token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in word_tokenize(text)]
    # Drop degenerate tokens with 11+ repeats of one letter, then
    # normalise whitespace.
    collapsed = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', ' '.join(lemmas))
    return ' '.join(collapsed.split())
def lam_list(list_words):
    """Lemmatise a list of words both as verbs and as nouns.

    Parameters
    ----------
    list_words : list of str
        Words to normalise; each is stripped and lowercased first.

    Returns
    -------
    tuple
        (verb lemmas, noun lemmas), each a list in the input order.
    """
    lemmatizer = WordNetLemmatizer()
    normalised = [w.strip().lower() for w in list_words]
    as_verbs = [lemmatizer.lemmatize(w, pos='v') for w in normalised]
    as_nouns = [lemmatizer.lemmatize(w, pos='n') for w in normalised]
    return as_verbs, as_nouns