# ## Some functions to clean text

# ### Some other suggested cleaning approaches
#
# #### From here: https://shravan-kuchkula.github.io/topic-modeling/#interactive-plot-showing-results-of-k-means-clustering-lda-topic-modeling-and-sentiment-analysis
#
# - remove_hyphens
# - tokenize_text
# - remove_special_characters
# - convert to lower case
# - remove stopwords
# - lemmatize the token
# - remove short tokens
# - keep only words in wordnet
# - I added on: creating a custom stop words list
# +
# Create a custom stop words list
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk import word_tokenize

# Add calendar months onto the stop words
import calendar
from tqdm import tqdm
import gradio as gr

stemmer = PorterStemmer()

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')  # needed for word_tokenize below
nltk.download('names')  # needed for nltk.corpus.names below
#nltk.download('words')
#nltk.corpus.words.words('en')

#from sklearn.feature_extraction import text

# Adding common names to the stop words
all_names = [x.lower() for x in list(nltk.corpus.names.words())]

# Adding custom words to the stop words
custom_words = []
my_stop_words = custom_words

cal_month = [x.lower() for x in list(calendar.month_name)]
# Remove blanks
cal_month = [i for i in cal_month if i]
#print(cal_month)

custom_words.extend(cal_month)

#my_stop_words = frozenset(text.ENGLISH_STOP_WORDS.union(custom_words).union(all_names))
#custom_stopwords = my_stop_words
# -
# #### Some of my cleaning functions
'''
# +
# Remove all html elements from the text. Inspired by this: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

def remove_email_start(text):
    cleanr = re.compile('.*importance:|.*subject:')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def remove_email_end(text):
    cleanr = re.compile('kind regards.*|many thanks.*|sincerely.*')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleanhtml(text):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

## The above doesn't work when there is no > at the end of the string to match the initial <. Trying this: <[^>]+> but needs work: https://stackoverflow.com/questions/2013124/regex-matching-up-to-the-first-occurrence-of-a-character

# Remove all email addresses and numbers from the text
def cleanemail(text):
    cleanr = re.compile('\S*@\S*\s?|\xa0')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleannum(text):
    cleanr = re.compile(r'[0-9]+')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleanpostcode(text):
    cleanr = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def cleanwarning(text):
    cleanr = re.compile('caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.')
    cleantext = re.sub(cleanr, '', text)
    return cleantext
# -

def initial_clean(texts):
    clean_texts = []
    for text in texts:
        text = remove_email_start(text)
        text = remove_email_end(text)
        text = cleanpostcode(text)
        text = remove_hyphens(text)
        text = cleanhtml(text)
        text = cleanemail(text)
        #text = cleannum(text)
        clean_texts.append(text)
    return clean_texts
'''
# Pre-compiling the regular expressions for efficiency
email_start_pattern = re.compile(r'.*importance:|.*subject:')
email_end_pattern = re.compile(r'kind regards.*|many thanks.*|sincerely.*')
html_pattern = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
email_pattern = re.compile(r'\S*@\S*\s?')
num_pattern = re.compile(r'[0-9]+')
postcode_pattern = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
warning_pattern = re.compile(r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.')
# Match HTML non-breaking space entities and the unicode non-breaking space character
nbsp_pattern = re.compile(r'&nbsp;|\xa0')


def stem_sentence(sentence):
    words = sentence.split()
    stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
    return stemmed_words


def stem_sentences(sentences, progress=gr.Progress()):
    """Stem each sentence in a list of sentences."""
    stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
    return stemmed_sentences
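

# A quick, illustrative check of the stemmer (kept commented out, like the sample
# execution further down, so nothing runs on import). The expected output is a rough
# guess at PorterStemmer's behaviour, not a verified result:
#stem_sentence("The engineers were running several cleaning processes")
# roughly ['the', 'engin', 'were', 'run', 'sever', 'clean', 'process']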
def get_lemma_text(text):
    # Tokenize the input string into words
    tokens = word_tokenize(text)

    lemmas = []
    for word in tokens:
        if len(word) > 3:
            lemma = wn.morphy(word)
        else:
            lemma = None

        if lemma is None:
            lemmas.append(word)
        else:
            lemmas.append(lemma)
    return lemmas


def get_lemma_tokens(tokens):
    # Works on text that has already been tokenised into words
    lemmas = []
    for word in tokens:
        if len(word) > 3:
            lemma = wn.morphy(word)
        else:
            lemma = None

        if lemma is None:
            lemmas.append(word)
        else:
            lemmas.append(lemma)
    return lemmas
def initial_clean(texts, progress=gr.Progress()):
    clean_texts = []
    i = 1
    #progress(0, desc="Cleaning texts")
    for text in progress.tqdm(texts, desc="Cleaning data", unit="rows"):
        #print("Cleaning row: ", i)
        text = re.sub(email_start_pattern, '', text)
        text = re.sub(email_end_pattern, '', text)
        text = re.sub(postcode_pattern, '', text)
        text = remove_hyphens(text)
        text = re.sub(html_pattern, '', text)
        text = re.sub(email_pattern, '', text)
        text = re.sub(nbsp_pattern, ' ', text)  # replace non-breaking spaces with a normal space
        #text = re.sub(warning_pattern, '', text)
        #text = stem_sentence(text)
        text = get_lemma_text(text)
        text = ' '.join(text)
        # Uncomment the next line if you want to remove numbers as well
        # text = re.sub(num_pattern, '', text)

        clean_texts.append(text)
        i += 1
    return clean_texts


# Sample execution
#sample_texts = [
#    "Hello, this is a test email. kind regards, John",
#    "<div>Email content here</div> many thanks, Jane",
#    "caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.",
#    "john.doe123@example.com",
#    "Address: 1234 Elm St, AB12 3CD"
#]

#initial_clean(sample_texts)
# +
all_names = [x.lower() for x in list(nltk.corpus.names.words())]


def remove_hyphens(text_text):
    return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)


# Tokenize text
def tokenize_text(text_text):
    TOKEN_PATTERN = r'\s+'
    regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=True)
    word_tokens = regex_wt.tokenize(text_text)
    return word_tokens


def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens


def convert_to_lowercase(tokens):
    return [token.lower() for token in tokens if token.isalpha()]


def remove_stopwords(tokens, custom_stopwords):
    stopword_list = nltk.corpus.stopwords.words('english')
    stopword_list += custom_stopwords
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens


def remove_names(tokens):
    stopword_list = [x.lower() for x in nltk.corpus.names.words()]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens


def remove_short_tokens(tokens):
    return [token for token in tokens if len(token) > 3]


def keep_only_words_in_wordnet(tokens):
    return [token for token in tokens if wn.synsets(token)]


def apply_lemmatize(tokens, wnl=WordNetLemmatizer()):
    def lem_word(word):
        if len(word) > 3:
            out_word = wnl.lemmatize(word)
        else:
            out_word = word
        return out_word
    return [lem_word(token) for token in tokens]
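

# Illustrative (commented-out) walk-through of the token-level helpers above,
# chained in roughly the order cleanTexttexts uses below. The sample sentence
# is invented for demonstration only:
#tokens = tokenize_text("Well-known issues were reported in January by John!")
#tokens = remove_characters_after_tokenization(tokens)
#tokens = convert_to_lowercase(tokens)
#tokens = remove_stopwords(tokens, my_stop_words)
#tokens = keep_only_words_in_wordnet(tokens)
#tokens = apply_lemmatize(tokens)
#print(tokens)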
# +
### Do the cleaning
def cleanTexttexts(texts):
    clean_texts = []
    for text in texts:
        #text = re.sub(email_start_pattern, '', text)
        #text = re.sub(email_end_pattern, '', text)
        text = remove_hyphens(text)
        text = re.sub(html_pattern, '', text)
        text = re.sub(email_pattern, '', text)
        text = re.sub(postcode_pattern, '', text)
        text = re.sub(num_pattern, '', text)
        #text = re.sub(warning_pattern, '', text)

        text_i = tokenize_text(text)
        text_i = remove_characters_after_tokenization(text_i)
        #text_i = remove_names(text_i)
        text_i = convert_to_lowercase(text_i)
        #text_i = remove_stopwords(text_i, my_stop_words)
        text_i = get_lemma_tokens(text_i)
        #text_i = remove_short_tokens(text_i)
        text_i = keep_only_words_in_wordnet(text_i)
        text_i = apply_lemmatize(text_i)

        clean_texts.append(text_i)
    return clean_texts
# -
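
# Illustrative (commented-out) use of cleanTexttexts; the input strings are
# invented examples and the exact output depends on the wordnet filter:
#token_lists = cleanTexttexts([
#    "Order confirmation sent to jane.doe@example.com on 03/01/2023",
#    "<p>Meeting notes attached - please review</p>"
#])
#print(token_lists)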
def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
    # Identify duplicates in the data: https://stackoverflow.com/questions/44191465/efficiently-identify-duplicates-in-large-list-500-000
    # Only identifies the second duplicate
    seen = set()
    dupes = []

    for i, doi in enumerate(data_samples_ready):
        if doi not in seen:
            seen.add(doi)
        else:
            dupes.append(i)
    #data_samples_ready[dupes[0:]]

    # To see a specific duplicated value you know the position of
    #matching = [s for s in data_samples_ready if data_samples_ready[83] in s]
    #matching

    # Remove duplicates only (keep the first instance)
    #data_samples_ready = list(dict.fromkeys(data_samples_ready)) # This way would keep one version of the duplicates

    ### Remove all duplicates, including the original instance

    # Identify ALL duplicates, including initial values
    # https://stackoverflow.com/questions/11236006/identify-duplicate-values-in-a-list-in-python
    from collections import defaultdict

    D = defaultdict(list)
    for i, item in enumerate(data_samples_ready):
        D[item].append(i)

    D = {k: v for k, v in D.items() if len(v) > 1}

    # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
    L = list(D.values())
    flat_list_dups = [item for sublist in L for item in sublist]

    # https://stackoverflow.com/questions/11303225/how-to-remove-multiple-indexes-from-a-list-at-the-same-time
    for index in sorted(flat_list_dups, reverse=True):
        del data_samples_ready[index]
        del data_samples_clean[index]
        del data_samples[index]

    # Remove blanks
    data_samples_ready = [i for i in data_samples_ready if i]
    data_samples_clean = [i for i in data_samples_clean if i]
    data_samples = [i for i in data_samples if i]

    return data_samples_ready, data_samples_clean, flat_list_dups, data_samples
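

# Illustrative (commented-out) use of remove_dups_text. The three input lists are
# assumed to be aligned row-for-row; the invented example below drops both copies
# of the duplicated "b" entry from every list:
#ready, clean, dup_positions, raw = remove_dups_text(
#    ["a", "b", "c", "b"],
#    ["A", "B", "C", "B"],
#    ["a!", "b!", "c!", "b!"]
#)
# ready -> ["a", "c"], dup_positions -> [1, 3]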