Spaces:
Running
Running
| import numpy as np | |
| import pandas as pd | |
| import string | |
| from unidecode import unidecode | |
| from collections import Counter | |
class TextPreprocessor:
    """Configurable text-cleaning pipeline for pandas Series of raw strings.

    Supports URL stripping, contraction expansion, punctuation/digit
    removal, stop-word removal, short-token filtering, and removal of the
    most frequent / rarest words measured on a training corpus.
    """

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        :param remove_punct: bool, drop punctuation characters.
        :param remove_digits: bool, drop digit characters.
        :param remove_stop_words: bool, drop tokens found in ``self.stop_words``.
        :param remove_short_words: bool, drop tokens whose length falls in
            ``[minlen, maxlen]``.
        :param minlen: int, minimum token length to remove (with ``remove_short_words``).
        :param maxlen: int, maximum token length to remove (with ``remove_short_words``).
        :param top_p: float, fraction of most frequent words to remove (None = off).
        :param bottom_p: float, fraction of rarest words to remove (None = off).
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        # Vocabulary learned on the training set and reused on val/dev/test.
        self.words_to_remove = []
        # Stop-word list (spaCy-style, plus a few domain additions such as
        # 'rs', 'new', 'old' and curly-quote clitics).
        self.stop_words = [
            "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
            "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
            'about', 'above', 'across', 'after', 'afterwards', 'again', 'against',
            'ain', 'all', 'almost', 'alone', 'along', 'already', 'also',
            'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and',
            'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
            'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because',
            'become', 'becomes', 'becoming', 'been', 'before', 'beforehand',
            'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond',
            'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could',
            'couldn', "couldn't", 'd', 'did', 'do', 'does', 'doing', 'done',
            'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else',
            'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
            'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
            'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from',
            'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have',
            'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby',
            'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
            'how', 'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is',
            'it', "it's", 'its', 'itself', 'just', 'keep', 'last', 'latter',
            'latterly', 'least', 'less', 'll', 'm', 'ma', 'made', 'make', 'many',
            'say', 'said', 'says', 'told', 'tell', 'may', 'me', 'meanwhile',
            'might', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much',
            'must', 'my', 'myself', 'name', 'namely', 'neither', 'never',
            'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone',
            'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often',
            'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others',
            'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part',
            'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're', 'rs',
            'really', 'regarding', 's', 'same', 'say', 'see', 'seem', 'seemed',
            'seeming', 'seems', 'serious', 'several', 'shan', "shan't", 'she',
            "she's", 'should', "should've", 'shouldn', "shouldn't", 'show',
            'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone',
            'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such',
            't', 'take', 'ten', 'than', 'that', "that'll", 'the', 'their',
            'theirs', 'them', 'themselves', 'then', 'thence', 'there',
            'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
            'they', 'third', 'this', 'those', 'though', 'three', 'through',
            'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top',
            'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'unless',
            'until', 'up', 'upon', 'us', 'used', 'using', 'various', 've',
            'very', 'via', 'was', 'wasn', "wasn't", 'we', 'well', 'were',
            'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever',
            'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
            'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
            'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
            'won', "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you',
            "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
            'yourselves', '‘d', '‘ll', '‘m', '‘re', '‘s', '‘ve', '’d', '’ll',
            '’m', '’re', 'new', 'old', '’s', '’ve']
        # Contraction -> expansion map used by __expand.
        # Fix: "you'll"/"you'll've" previously expanded to "you you will
        # (have)" with a duplicated "you".
        self.contraction_to_expansion = {
            "ain't": "am not", "aren't": "are not", "can't": "cannot",
            "can't've": "cannot have", "'cause": "because",
            "could've": "could have", "couldn't": "could not",
            "couldn't've": "could not have", "didn't": "did not",
            "doesn't": "does not", "don't": "do not", "hadn't": "had not",
            "hadn't've": "had not have", "hasn't": "has not",
            "haven't": "have not", "he'd": "he would",
            "he'd've": "he would have", "he'll": "he will",
            "he'll've": "he will have", "he's": "he is", "how'd": "how did",
            "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
            "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
            "i'll've": "i will have", "i'm": "i am", "i've": "i have",
            "isn't": "is not", "it'd": "it had", "it'd've": "it would have",
            "it'll": "it will", "it'll've": "it will have", "it's": "it is",
            "let's": "let us", "ma'am": "madam", "mayn't": "may not",
            "might've": "might have", "mightn't": "might not",
            "mightn't've": "might not have", "must've": "must have",
            "mustn't": "must not", "mustn't've": "must not have",
            "needn't": "need not", "needn't've": "need not have",
            "o'clock": "of the clock", "oughtn't": "ought not",
            "oughtn't've": "ought not have", "shan't": "shall not",
            "sha'n't": "shall not", "shan't've": "shall not have",
            "she'd": "she would", "she'd've": "she would have",
            "she'll": "she will", "she'll've": "she will have",
            "she's": "she is", "should've": "should have",
            "shouldn't": "should not", "shouldn't've": "should not have",
            "so've": "so have", "so's": "so is", "that'd": "that would",
            "that'd've": "that would have", "that's": "that is",
            "there'd": "there had", "there'd've": "there would have",
            "there's": "there is", "they'd": "they would",
            "they'd've": "they would have", "they'll": "they will",
            "they'll've": "they will have", "they're": "they are",
            "they've": "they have", "to've": "to have", "wasn't": "was not",
            "we'd": "we had", "we'd've": "we would have", "we'll": "we will",
            "we'll've": "we will have", "we're": "we are", "we've": "we have",
            "weren't": "were not", "what'll": "what will",
            "what'll've": "what will have", "what're": "what are",
            "what's": "what is", "what've": "what have", "when's": "when is",
            "when've": "when have", "where'd": "where did",
            "where's": "where is", "where've": "where have",
            "who'll": "who will", "who'll've": "who will have",
            "who's": "who is", "who've": "who have", "why's": "why is",
            "why've": "why have", "will've": "will have", "won't": "will not",
            "won't've": "will not have", "would've": "would have",
            "wouldn't": "would not", "wouldn't've": "would not have",
            "y'all": "you all", "y'alls": "you alls",
            "y'all'd": "you all would", "y'all'd've": "you all would have",
            "y'all're": "you all are", "y'all've": "you all have",
            "you'd": "you had", "you'd've": "you would have",
            "you'll": "you will", "you'll've": "you will have",
            "you're": "you are", "you've": "you have",
        }
| def __remove_double_whitespaces(string: str): | |
| return " ".join(string.split()) | |
| async def __remove_url(self, string_series: pd.Series): | |
| """ | |
| Removes URLs m text | |
| :param string_series: pd.Series, input string series | |
| :return: pd.Series, cleaned string series | |
| """ | |
| clean_string_series = string_series.str.replace( | |
| pat=r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", | |
| repl=" ", regex=True).copy() | |
| return clean_string_series.map(self.__remove_double_whitespaces) | |
| async def __expand(self, string_series: pd.Series): | |
| """ | |
| Replaces contractions with expansions. eg. don't wit do not. | |
| :param string_series: pd.Series, input string series | |
| :return: pd.Series, cleaned string series | |
| """ | |
| clean_string_series = string_series.copy() | |
| for c, e in self.contraction_to_expansion.items(): | |
| clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy() | |
| return clean_string_series.map(self.__remove_double_whitespaces) | |
| async def __remove_punct(self, string_series: pd.Series): | |
| """ | |
| Removes punctuations from the input string. | |
| :param string_series: pd.Series, input string series | |
| :return: pd.Series, cleaned string series | |
| """ | |
| clean_string_series = string_series.copy() | |
| puncts = [r'\n', r'\r', r'\t'] | |
| puncts.extend(list(string.punctuation)) | |
| for i in puncts: | |
| clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy() | |
| return clean_string_series.map(self.__remove_double_whitespaces) | |
| async def __remove_digits(self, string_series: pd.Series): | |
| """ | |
| Removes digits from the input string. | |
| :param string_series: pd.Series, input string series | |
| :return: pd.Series, cleaned string series | |
| """ | |
| clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True).copy() | |
| return clean_string_series.map(self.__remove_double_whitespaces) | |
| async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1): | |
| """ | |
| Reomves words/tokens where minlen <= len <= maxlen. | |
| :param string_series: pd.Series, input string series | |
| :param minlen: int, minimum length of token to be removed. | |
| :param maxlen: int, maximum length of token to be removed. | |
| :return: pd.Series, cleaned string series | |
| """ | |
| clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split() if | |
| (len(word) > maxlen) or (len(word) < minlen)])) | |
| return clean_string_series | |
| async def __remove_stop_words(self, string_series: pd.Series): | |
| """ | |
| Removes stop words from the input string. | |
| :param string_series: pd.Series, input string series | |
| :return: pd.Series, cleaned string series | |
| """ | |
| def str_remove_stop_words(string: str): | |
| stops = self.stop_words | |
| return " ".join([token for token in string.split() if token not in stops]) | |
| return string_series.map(str_remove_stop_words) | |
| async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None, | |
| bottom_p: int = None, dataset: str = 'train'): | |
| """ | |
| Reomoves top_p percent (frequent) words and bottom_p percent (rare) words. | |
| :param string_series: pd.Series, input string series | |
| :param top_p: float, percent of frequent words to remove. | |
| :param bottom_p: float, percent of rare words to remove. | |
| :param dataset: str, "train" for training set, "tesrt" for val/dev/test set. | |
| :return: pd.Series, cleaned string series | |
| """ | |
| if dataset == 'train': | |
| if top_p is None: | |
| top_p = 0 | |
| if bottom_p is None: | |
| bottom_p = 0 | |
| if top_p > 0 or bottom_p > 0: | |
| word_freq = pd.Series(" ".join(string_series).split()).value_counts() | |
| n_words = len(word_freq) | |
| if top_p > 0: | |
| self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]]) | |
| if bottom_p > 0: | |
| self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]]) | |
| if len(self.words_to_remove) == 0: | |
| return string_series | |
| else: | |
| clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split() | |
| if word not in self.words_to_remove])) | |
| return clean_string_series | |
| async def preprocess(self, string_series: pd.Series, dataset: str = "train"): | |
| """ | |
| Entry point. | |
| :param string_series: pd.Series, input string series | |
| :param dataset: str, "train" for training set, "tesrt" for val/dev/test set. | |
| :return: pd.Series, cleaned string series | |
| """ | |
| string_series = string_series.str.lower().copy() | |
| string_series = string_series.map(unidecode).copy() | |
| string_series = await self.__remove_url(string_series=string_series) | |
| string_series = await self.__expand(string_series=string_series) | |
| if self.remove_punct: | |
| string_series = await self.__remove_punct(string_series=string_series) | |
| if self.remove_digits: | |
| string_series = await self.__remove_digits(string_series=string_series) | |
| if self.remove_stop_words: | |
| string_series = await self.__remove_stop_words(string_series=string_series) | |
| if self.remove_short_words: | |
| string_series = await self.__remove_short_words(string_series=string_series, | |
| minlen=self.minlen, | |
| maxlen=self.maxlen) | |
| string_series = await self.__remove_top_bottom_words(string_series=string_series, | |
| top_p=self.top_p, | |
| bottom_p=self.bottom_p, dataset=dataset) | |
| string_series = string_series.str.strip().copy() | |
| string_series.replace(to_replace="", value="this is an empty message", inplace=True) | |
| return string_series | |
async def get_frequent_words_html(df):
    """Build an HTML word-cloud fragment for the 25 most frequent tokens.

    Concatenates the ``title`` and ``description`` columns, cleans them with
    ``TextPreprocessor``, then emits clickable token anchors separated by a
    random run of spaces. A nested section div is opened after the fifth
    token; both open divs are closed at the end.

    :param df: DataFrame with string columns ``title`` and ``description``
    :return: str, HTML fragment
    """
    preprocessor = TextPreprocessor()
    cleaned = await preprocessor.preprocess(df['title'] + ' ' + df['description'])
    token_counts = Counter(' '.join(list(cleaned)).split())
    parts = ['<div class="word-cloud-container">']
    for position, (token, _count) in enumerate(token_counts.most_common(25), start=1):
        gap = " " * np.random.randint(3, 7, 1)[0]
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{gap}')
        if position == 5:
            parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
    parts.append('</div></div>')
    return "".join(parts)