# Spaces: ksvmuralidhar / news_aggregator (Running)
# File size: 19,435 Bytes
# 078c1e1

import numpy as np
import pandas as pd
import string
from unidecode import unidecode
from collections import Counter


class TextPreprocessor:
    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        self.words_to_remove = []
        self.stop_words = ["'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'ain',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'couldn',
 "couldn't",
 'd',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'having',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'll',
 'm',
 'ma',
 'made',
 'make',
 'many',
'say',
'said',
'says',
'told',
'tell',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'no',
 'nobody',
 'none',
 'noone',
 'nor',
 'not',
 'nothing',
 'now',
 'nowhere',
 'o',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
'rs',
 'really',
 'regarding',
 's',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 't',
 'take',
 'ten',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 've',
 'very',
 'via',
 'was',
 'wasn',
 "wasn't",
 'we',
 'well',
 'were',
 'weren',
 "weren't",
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'won',
 "won't",
 'would',
 'wouldn',
 "wouldn't",
 'y',
 'yet',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
'new',
'old',
 '’s',
 '’ve']

        self.contraction_to_expansion = {"ain't": "am not",
                                         "aren't": "are not",
                                         "can't": "cannot",
                                         "can't've": "cannot have",
                                         "'cause": "because",
                                         "could've": "could have",
                                         "couldn't": "could not",
                                         "couldn't've": "could not have",
                                         "didn't": "did not",
                                         "doesn't": "does not",
                                         "don't": "do not",
                                         "hadn't": "had not",
                                         "hadn't've": "had not have",
                                         "hasn't": "has not",
                                         "haven't": "have not",
                                         "he'd": "he would",
                                         "he'd've": "he would have",
                                         "he'll": "he will",
                                         "he'll've": "he will have",
                                         "he's": "he is",
                                         "how'd": "how did",
                                         "how'd'y": "how do you",
                                         "how'll": "how will",
                                         "how's": "how is",
                                         "i'd": "i would",
                                         "i'd've": "i would have",
                                         "i'll": "i will",
                                         "i'll've": "i will have",
                                         "i'm": "i am",
                                         "i've": "i have",
                                         "isn't": "is not",
                                         "it'd": "it had",
                                         "it'd've": "it would have",
                                         "it'll": "it will",
                                         "it'll've": "it will have",
                                         "it's": "it is",
                                         "let's": "let us",
                                         "ma'am": "madam",
                                         "mayn't": "may not",
                                         "might've": "might have",
                                         "mightn't": "might not",
                                         "mightn't've": "might not have",
                                         "must've": "must have",
                                         "mustn't": "must not",
                                         "mustn't've": "must not have",
                                         "needn't": "need not",
                                         "needn't've": "need not have",
                                         "o'clock": "of the clock",
                                         "oughtn't": "ought not",
                                         "oughtn't've": "ought not have",
                                         "shan't": "shall not",
                                         "sha'n't": "shall not",
                                         "shan't've": "shall not have",
                                         "she'd": "she would",
                                         "she'd've": "she would have",
                                         "she'll": "she will",
                                         "she'll've": "she will have",
                                         "she's": "she is",
                                         "should've": "should have",
                                         "shouldn't": "should not",
                                         "shouldn't've": "should not have",
                                         "so've": "so have",
                                         "so's": "so is",
                                         "that'd": "that would",
                                         "that'd've": "that would have",
                                         "that's": "that is",
                                         "there'd": "there had",
                                         "there'd've": "there would have",
                                         "there's": "there is",
                                         "they'd": "they would",
                                         "they'd've": "they would have",
                                         "they'll": "they will",
                                         "they'll've": "they will have",
                                         "they're": "they are",
                                         "they've": "they have",
                                         "to've": "to have",
                                         "wasn't": "was not",
                                         "we'd": "we had",
                                         "we'd've": "we would have",
                                         "we'll": "we will",
                                         "we'll've": "we will have",
                                         "we're": "we are",
                                         "we've": "we have",
                                         "weren't": "were not",
                                         "what'll": "what will",
                                         "what'll've": "what will have",
                                         "what're": "what are",
                                         "what's": "what is",
                                         "what've": "what have",
                                         "when's": "when is",
                                         "when've": "when have",
                                         "where'd": "where did",
                                         "where's": "where is",
                                         "where've": "where have",
                                         "who'll": "who will",
                                         "who'll've": "who will have",
                                         "who's": "who is",
                                         "who've": "who have",
                                         "why's": "why is",
                                         "why've": "why have",
                                         "will've": "will have",
                                         "won't": "will not",
                                         "won't've": "will not have",
                                         "would've": "would have",
                                         "wouldn't": "would not",
                                         "wouldn't've": "would not have",
                                         "y'all": "you all",
                                         "y'alls": "you alls",
                                         "y'all'd": "you all would",
                                         "y'all'd've": "you all would have",
                                         "y'all're": "you all are",
                                         "y'all've": "you all have",
                                         "you'd": "you had",
                                         "you'd've": "you would have",
                                         "you'll": "you you will",
                                         "you'll've": "you you will have",
                                         "you're": "you are",
                                         "you've": "you have"
                                         }

    @staticmethod
    def __remove_double_whitespaces(string: str):
        return " ".join(string.split())

    def __remove_url(self, string_series: pd.Series):
        """
        Removes URLs m text
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(
            pat=r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})",
            repl=" ", regex=True).copy()
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __expand(self, string_series: pd.Series):
        """
        Replaces contractions with expansions. eg. don't wit do not.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.copy()
        for c, e in self.contraction_to_expansion.items():
            clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_punct(self, string_series: pd.Series):
        """
       Removes punctuations from the input string.
       :param string_series: pd.Series, input string series
       :return: pd.Series, cleaned string series
       """
        clean_string_series = string_series.copy()
        puncts = [r'\n', r'\r', r'\t']
        puncts.extend(list(string.punctuation))
        for i in puncts:
            clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """
       Removes digits from the input string.
       :param string_series: pd.Series, input string series
       :return: pd.Series, cleaned string series
       """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True).copy()
        return clean_string_series.map(self.__remove_double_whitespaces)

    @staticmethod
    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
        """
        Reomves words/tokens where minlen <= len <= maxlen.
        :param string_series: pd.Series, input string series
        :param minlen: int, minimum length of token to be removed.
        :param maxlen:  int, maximum length of token to be removed.
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split() if
                                                                         (len(word) > maxlen) or (len(word) < minlen)]))
        return clean_string_series

    def __remove_stop_words(self, string_series: pd.Series):
        """
       Removes stop words from the input string.
       :param string_series: pd.Series, input string series
       :return: pd.Series, cleaned string series
       """
        def str_remove_stop_words(string: str):
            stops = self.stop_words
            return " ".join([token for token in string.split() if token not in stops])

        return string_series.map(str_remove_stop_words)

    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
                                  bottom_p: int = None, dataset: str = 'train'):
        """
        Reomoves top_p percent (frequent) words and bottom_p percent (rare) words.
        :param string_series: pd.Series, input string series
        :param top_p: float, percent of frequent words to remove.
        :param bottom_p: float, percent of rare words to remove.
        :param dataset: str, "train" for training set, "tesrt" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        if dataset == 'train':
            if top_p is None:
                top_p = 0
            if bottom_p is None:
                bottom_p = 0

            if top_p > 0 or bottom_p > 0:
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)

            if top_p > 0:
                self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])

            if bottom_p > 0:
                self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])

        if len(self.words_to_remove) == 0:
            return string_series
        else:
            clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split()
                                                                             if word not in self.words_to_remove]))
            return clean_string_series

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point: lower-cases, transliterates to ASCII, strips URLs, expands contractions,
        then applies the optional cleaning steps selected in the constructor.
        Strings that end up empty are replaced by "this is an empty message".
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set (frequency-based removal words are learned),
                        anything else (e.g. "test") for val/dev/test set (learned words are reused).
        :return: pd.Series, cleaned string series
        """
        string_series = string_series.str.lower()
        # Missing values survive .str.lower() as NaN and would crash unidecode and
        # the later str.split() calls; treat them as empty strings instead.
        string_series = string_series.fillna("")
        string_series = string_series.map(unidecode)
        string_series = self.__remove_url(string_series=string_series)
        string_series = self.__expand(string_series=string_series)

        if self.remove_punct:
            string_series = self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = self.__remove_stop_words(string_series=string_series)
        if self.remove_short_words:
            string_series = self.__remove_short_words(string_series=string_series,
                                                      minlen=self.minlen,
                                                      maxlen=self.maxlen)
        string_series = self.__remove_top_bottom_words(string_series=string_series,
                                                       top_p=self.top_p,
                                                       bottom_p=self.bottom_p, dataset=dataset)

        string_series = string_series.str.strip()
        # Placeholder so no row is left empty after cleaning.
        string_series = string_series.replace(to_replace="", value="this is an empty message")

        return string_series


def get_frequent_words_html(df):
    """
    Builds the word-cloud HTML snippet for the 25 most frequent tokens in the
    preprocessed 'title' + 'description' text.

    Each token becomes an <a class="wc-tokens"> element whose onclick calls
    wc_search(token) (presumably a front-end function; not defined here),
    followed by 3-6 random non-breaking spaces. After the fifth token a nested
    "word-cloud-section" <div> is opened to hold the remaining tokens.

    :param df: pd.DataFrame with 'title' and 'description' columns.
    :return: str, HTML markup of the word cloud.
    """
    # A missing title or description would turn the whole concatenation into
    # NaN, so missing cells are treated as empty text.
    raw_text = df['title'].fillna('') + ' ' + df['description'].fillna('')

    text_preprocess = TextPreprocessor()
    preprocessed_txt = text_preprocess.preprocess(raw_text)
    # preprocess() substitutes this placeholder for rows that end up empty; its
    # words must not be counted as if they came from the news text.
    preprocessed_txt = preprocessed_txt[preprocessed_txt != "this is an empty message"]
    counter = Counter(' '.join(preprocessed_txt).split())

    parts = ['<div class="word-cloud-container">']
    section_open = False
    for position, (token, _count) in enumerate(counter.most_common(25), start=1):
        spacer = "&nbsp;" * np.random.randint(3, 7, 1)[0]
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{spacer}')
        if position == 5:
            parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_open = True
    # Close the nested section only if it was opened (fewer than five tokens
    # would otherwise emit an unmatched </div>).
    if section_open:
        parts.append('</div>')
    parts.append('</div>')
    return ''.join(parts)