seanpedrickcase's picture
Many changes to code organisation. More efficient searches from using intermediate outputs. Version 0.1
99d6fba
raw
history blame
No virus
5.71 kB
# ## Some functions to clean text
import re
import string
import polars as pl
# Add calendar months onto stop words
import calendar
#from tqdm import tqdm
import gradio as gr
# Custom stop words: starts empty and is then extended with the calendar months.
custom_words = []

# NOTE: my_stop_words is bound to the very same list object as custom_words,
# so anything added to custom_words below is visible through it as well.
my_stop_words = custom_words

# Lower-cased month names. calendar.month_name has an empty string at index 0,
# which the truthiness filter drops.
cal_month = [month_name.lower() for month_name in calendar.month_name if month_name]

custom_words.extend(cal_month)
# #### Regular expressions used by the cleaning functions
#
# The raw pattern strings are kept as module-level names because
# initial_clean() passes them directly to polars; the compiled versions
# below are for use with Python's `re` module.

# Everything on a line up to and including "importance:" or "subject:".
email_start_pattern_regex = r'.*importance:|.*subject:'
# A common sign-off and whatever follows it on the same line.
email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
# HTML tags, named/numeric character entities and non-breaking spaces.
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
# Any whitespace-delimited token containing "@", plus one trailing whitespace character.
email_pattern_regex = r'\S*@\S*\s?'
# Runs of digits.
num_pattern_regex = r'[0-9]+'
# Postcode-shaped strings (full codes, "GIR 0AA", and outward/partial codes at end of string).
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
# Standard "external sender" banner text (lower-case form).
warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
# Literal HTML non-breaking-space entity.
nbsp_pattern_regex = r'&nbsp;'

# Pre-compiling the regular expressions for efficiency
email_start_pattern = re.compile(email_start_pattern_regex)
email_end_pattern = re.compile(email_end_pattern_regex)
html_pattern = re.compile(html_pattern_regex)
# Fixed: this previously compiled email_end_pattern_regex, so email_pattern
# matched sign-offs instead of e-mail addresses.
email_pattern = re.compile(email_pattern_regex)
num_pattern = re.compile(num_pattern_regex)
postcode_pattern = re.compile(postcode_pattern_regex)
warning_pattern = re.compile(warning_pattern_regex)
nbsp_pattern = re.compile(nbsp_pattern_regex)
# def stem_sentence(sentence):
# words = sentence.split()
# stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
# return stemmed_words
# def stem_sentences(sentences, progress=gr.Progress()):
# """Stem each sentence in a list of sentences."""
# stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
# return stemmed_sentences
# def get_lemma_text(text):
# # Tokenize the input string into words
# tokens = word_tokenize(text)
# lemmas = []
# for word in tokens:
# if len(word) > 3:
# lemma = wn.morphy(word)
# else:
# lemma = None
# if lemma is None:
# lemmas.append(word)
# else:
# lemmas.append(lemma)
# return lemmas
# def get_lemma_tokens(tokens):
# Tokenize the input string into words
# lemmas = []
# for word in tokens:
# if len(word) > 3:
# lemma = wn.morphy(word)
# else:
# lemma = None
# if lemma is None:
# lemmas.append(word)
# else:
# lemmas.append(lemma)
# return lemmas
def initial_clean(texts , progress=gr.Progress()):
    """Strip e-mail boilerplate and HTML residue from a collection of texts.

    The texts are loaded into a polars Series and the module-level pattern
    strings are applied one after another with ``str.replace_all``, each
    match being replaced by an empty string. The order is: e-mail start
    pattern, e-mail end pattern, HTML pattern, e-mail address pattern.

    Args:
        texts: Values accepted by ``pl.Series`` (presumably an iterable of
            strings -- confirm against callers).
        progress: Gradio progress tracker; not used inside this function.

    Returns:
        A Python list with the cleaned values.
    """
    removal_patterns = (
        email_start_pattern_regex,
        email_end_pattern_regex,
        html_pattern_regex,
        email_pattern_regex,
    )

    cleaned = pl.Series(texts)
    for removal_pattern in removal_patterns:
        cleaned = cleaned.str.replace_all(removal_pattern, '')

    return cleaned.to_list()
def remove_hyphens(text_text):
    """Replace hyphens that join runs of word characters with spaces.

    Each match consists of two hyphen-joined runs, optionally followed by a
    further hyphen and a single word character. The captured pieces are
    re-joined with single spaces. When the optional third piece is absent it
    contributes an empty string, so the replacement ends with a space
    (e.g. ``"well-known"`` becomes ``"well known "``).
    """
    hyphenated = re.compile(r'(\w+)-(\w+)-?(\w)?')

    def _spaced(match):
        # A group that did not take part in the match is treated as ''.
        first, second, third = match.groups(default='')
        return f'{first} {second} {third}'

    return hyphenated.sub(_spaced, text_text)
def remove_characters_after_tokenization(tokens):
    """Strip ASCII punctuation from every token and discard empty results.

    All tokens are processed up front; the return value is a lazy ``filter``
    object over the stripped tokens, so it can be iterated only once.
    """
    punctuation_class = re.compile(f'[{re.escape(string.punctuation)}]')

    stripped_tokens = []
    for token in tokens:
        stripped_tokens.append(punctuation_class.sub('', token))

    # filter(None, ...) drops tokens that became empty strings.
    return filter(None, stripped_tokens)
def convert_to_lowercase(tokens):
    """Return the purely alphabetic tokens, lower-cased.

    Tokens for which ``str.isalpha()`` is false (digits, punctuation,
    empty strings, mixed content) are dropped.
    """
    lowered = []
    for token in tokens:
        if not token.isalpha():
            continue
        lowered.append(token.lower())
    return lowered
def remove_short_tokens(tokens):
    """Return only the tokens that are longer than three characters."""
    long_enough = []
    for token in tokens:
        if len(token) <= 3:
            continue
        long_enough.append(token)
    return long_enough
def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
    """Remove every entry whose prepared text is duplicated, then drop blanks.

    Duplicates are detected on ``data_samples_ready`` only. For any value that
    occurs more than once, ALL of its positions (including the first
    occurrence) are removed from each of the three parallel lists. Afterwards
    falsy entries (e.g. empty strings) are removed from each list.

    The input sequences are not modified; new lists are returned.

    NOTE(review): blanks are filtered from each list independently, so the
    returned lists can end up with different lengths if a position is blank
    in one list but not in the others. This is kept as-is; confirm whether
    callers rely on the lists staying aligned.

    Args:
        data_samples_ready: Prepared texts used as the duplicate key; items
            must be hashable.
        data_samples_clean: Companion sequence, indexed like
            ``data_samples_ready``.
        data_samples: Companion sequence, indexed like ``data_samples_ready``.

    Returns:
        A tuple ``(data_samples_ready, data_samples_clean, flat_list_dups,
        data_samples)`` where ``flat_list_dups`` lists the removed positions,
        grouped by duplicated value in order of first appearance.
    """
    from collections import defaultdict

    # Collect the positions at which each prepared text occurs.
    positions_by_text = defaultdict(list)
    for index, item in enumerate(data_samples_ready):
        positions_by_text[item].append(index)

    # Flatten the position lists of every value seen more than once.
    flat_list_dups = [
        index
        for positions in positions_by_text.values()
        if len(positions) > 1
        for index in positions
    ]
    dup_positions = set(flat_list_dups)

    def _without_dups_and_blanks(values):
        # One pass per list: skip duplicated positions and falsy entries.
        return [
            value
            for index, value in enumerate(values)
            if index not in dup_positions and value
        ]

    return (
        _without_dups_and_blanks(data_samples_ready),
        _without_dups_and_blanks(data_samples_clean),
        flat_list_dups,
        _without_dups_and_blanks(data_samples),
    )