Spaces:
Sleeping
Sleeping
import os.path
import re

from nltk import *
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from Pinpoint_Internal.Logger import *
# Make sure the data needed by NLTK's POS tagger is available.
# Calling ``download()`` with no arguments starts NLTK's interactive downloader, which blocks when this module is
# imported on a machine without a user at the console, so the resources are requested by id instead.
try:
    tagged = pos_tag(["test"])
except LookupError:
    # The tagger resource id differs between NLTK releases, so both ids are requested.
    # NOTE(review): an id missing from the installed index is expected to be reported and skipped - confirm.
    for _resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
        download(_resource, quiet=True)
class sanitization():
    """
    Sanitizes a corpus of text token by token.

    Every space-separated token has its non-alphabet characters removed, is lower-cased, stemmed, filtered against
    the English stop word list and dropped when it is too short. Unless the caller opts out, the sanitized text is
    stored in a file inside the output folder and that file is reused on later runs.
    """

    # Matches every character that is neither an ASCII letter nor a space. Compiled once for all calls.
    _NON_ALPHA = re.compile('[^a-zA-Z ]')

    # Stemmer shared by all calls; created on first use in stemmer().
    _porter = None

    def sanitize(self, text, output_folder, force_new_data_and_dont_persisit=False):
        """
        Entry function for sanitizing text
        :param text: the raw text; it is split on single spaces into tokens
        :param output_folder: folder holding the cache file "sanitized_text.txt"
        :param force_new_data_and_dont_persisit: when True the cache file is neither read nor written
        :return: the sanitized words separated by single spaces
        """
        sanitize_file_name = os.path.join(output_folder, "sanitized_text.txt")

        # Reuse the cached result unless the caller asked for a fresh, non-persisted run.
        if os.path.isfile(sanitize_file_name) and not force_new_data_and_dont_persisit:
            logger.print_message("Sanitized file exists. Using data")
            with open(sanitize_file_name, 'r', encoding="utf8") as cached_file:
                return cached_file.read().strip()

        # NOTE(review): tokens are split on " " only, so a token may still contain newlines or tabs;
        # remove_non_alpha turns those into spaces and the later steps then see several words at once.
        raw_words = text.split(" ")
        total_words = len(raw_words)
        logger.print_message("Starting sanitization... {} words to go".format(total_words))

        kept_words = []
        for number, word in enumerate(raw_words, start=1):
            word = self.remove_non_alpha(word)
            word = self.lower(word)
            word = self.stemmer(word)
            word = self.remove_stop_words(word)
            # NOTE(review): remove_stop_words appends a trailing space, which the length check below counts,
            # so with the default threshold a three letter word is kept. Confirm this is intended.
            word = self.remove_small_words(word)

            # Removed words come back as "" (never None); split() contributes nothing for them and also
            # strips the padding around kept words, so the output never contains runs of spaces.
            kept_words.extend(word.split())
            logger.print_message("Completed {} of {} sanitized words".format(number, total_words))

        final_text = " ".join(kept_words)

        if not force_new_data_and_dont_persisit:
            with open(sanitize_file_name, 'w', encoding="utf8") as file_to_write:
                file_to_write.write(final_text)

        return final_text

    def stemmer(self, word):
        """
        Get stemms of words
        :param word: the word to stem
        :return: the stemmed word using the Porter stemmer
        """
        # todo: assess another stemmer (e.g. LancasterStemmer)
        # Build the stemmer once and reuse it instead of constructing a new one for every word.
        if sanitization._porter is None:
            sanitization._porter = PorterStemmer()
        return sanitization._porter.stem(word)

    def lower(self, word):
        """
        Get the lower case representation of words
        :param word: the word to convert
        :return: the lowercase representation of the word
        """
        return word.lower()

    def remove_stop_words(self, text):
        """
        Remove stop words
        :param text: whitespace separated words
        :return: the words that are not in ENGLISH_STOP_WORDS, each followed by a single space
        """
        # Every kept word is followed by a space, so a non-empty result ends with a trailing space.
        return "".join(word + " " for word in text.split() if word not in ENGLISH_STOP_WORDS)

    def remove_non_alpha(self, word):
        """
        Removes non alphabet characters (Excluding spaces)
        :param word: the text to clean
        :return: the word with non-alpha characters removed
        """
        # Newlines and tabs become spaces so they survive the filter below as word separators.
        # A double space is then reduced to a single one (one pass only, longer runs are merely shortened).
        word = word.replace("\n", " ").replace("\t", " ").replace("  ", " ")
        return self._NON_ALPHA.sub('', word)

    def remove_small_words(self, word, length_to_remove_if_not_equal=4):
        """
        Removes words that are too small.
        :param word: the word to check; its full length is measured, including any surrounding spaces
        :param length_to_remove_if_not_equal: minimum length a word needs in order to be kept
        :return: the word if its length is at least length_to_remove_if_not_equal, otherwise ""
        """
        return word if len(word) >= length_to_remove_if_not_equal else ""