Spaces:

ksvmuralidhar
/

uk_news_aggregator

Running

App Files Files Community

ksvmuralidhar committed on May 1

Commit

3e1ad25

•

1 Parent(s): ce359bc

Update word_cloud.py

Browse files

Files changed (1) hide show

word_cloud.py +17 -17

word_cloud.py CHANGED Viewed

@@ -503,7 +503,7 @@ class TextPreprocessor:
     def __remove_double_whitespaces(string: str):
         """
         Collapses every run of whitespace into a single space.
         Leading and trailing whitespace is dropped as well, because
         str.split() with no arguments discards empty fields.
         :param string: str, input string
         :return: str, the whitespace-normalised string
         """
         return " ".join(string.split())
-    def __remove_url(self, string_series: pd.Series):
         """
         Removes URLs from text
         :param string_series: pd.Series, input string series
@@ -514,7 +514,7 @@ class TextPreprocessor:
             repl=" ", regex=True).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
-    def __expand(self, string_series: pd.Series):
         """
         Replaces contractions with expansions. e.g. don't with do not.
         :param string_series: pd.Series, input string series
@@ -525,7 +525,7 @@ class TextPreprocessor:
             clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
-    def __remove_punct(self, string_series: pd.Series):
         """
        Removes punctuations from the input string.
        :param string_series: pd.Series, input string series
@@ -538,7 +538,7 @@ class TextPreprocessor:
             clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
-    def __remove_digits(self, string_series: pd.Series):
         """
        Removes digits from the input string.
        :param string_series: pd.Series, input string series
@@ -548,7 +548,7 @@ class TextPreprocessor:
         return clean_string_series.map(self.__remove_double_whitespaces)
     @staticmethod
-    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
         """
         Removes words/tokens where minlen <= len <= maxlen.
         :param string_series: pd.Series, input string series
@@ -560,7 +560,7 @@ class TextPreprocessor:
                                                                          (len(word) > maxlen) or (len(word) < minlen)]))
         return clean_string_series
-    def __remove_stop_words(self, string_series: pd.Series):
         """
        Removes stop words from the input string.
        :param string_series: pd.Series, input string series
@@ -572,7 +572,7 @@ class TextPreprocessor:
         return string_series.map(str_remove_stop_words)
-    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
                                   bottom_p: int = None, dataset: str = 'train'):
         """
         Removes top_p percent (frequent) words and bottom_p percent (rare) words.
@@ -605,7 +605,7 @@ class TextPreprocessor:
                                                                              if word not in self.words_to_remove]))
             return clean_string_series
-    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
         """
         Entry point.
         :param string_series: pd.Series, input string series
@@ -616,20 +616,20 @@ class TextPreprocessor:
         string_series = string_series.str.replace("<br>", " ")
         string_series = string_series.str.lower().copy()
         string_series = string_series.map(unidecode).copy()
-        string_series = self.__remove_url(string_series=string_series)
-        string_series = self.__expand(string_series=string_series)
         if self.remove_punct:
-            string_series = self.__remove_punct(string_series=string_series)
         if self.remove_digits:
-            string_series = self.__remove_digits(string_series=string_series)
         if self.remove_stop_words:
-            string_series = self.__remove_stop_words(string_series=string_series)
         if self.remove_short_words:
-            string_series = self.__remove_short_words(string_series=string_series,
                                                       minlen=self.minlen,
                                                       maxlen=self.maxlen)
-        string_series = self.__remove_top_bottom_words(string_series=string_series,
                                                        top_p=self.top_p,
                                                        bottom_p=self.bottom_p, dataset=dataset)
@@ -639,9 +639,9 @@ class TextPreprocessor:
         return string_series
-def get_frequent_words_html(df):
     text_preprocess = TextPreprocessor()
-    preprocessed_txt = text_preprocess.preprocess(df['title'] + ' ' + df['description'])
     counter = Counter(' '.join([*preprocessed_txt]).split())
     freq_tokens_html = '<div class="word-cloud-container">'

     def __remove_double_whitespaces(string: str):
         """
         Collapses every run of whitespace into a single space.
         Leading and trailing whitespace is dropped as well, because
         str.split() with no arguments discards empty fields.
         :param string: str, input string
         :return: str, the whitespace-normalised string
         """
         return " ".join(string.split())
+    async def __remove_url(self, string_series: pd.Series):
         """
         Removes URLs from text
         :param string_series: pd.Series, input string series
             repl=" ", regex=True).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
+    async def __expand(self, string_series: pd.Series):
         """
         Replaces contractions with expansions. e.g. don't with do not.
         :param string_series: pd.Series, input string series
             clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
+    async def __remove_punct(self, string_series: pd.Series):
         """
        Removes punctuations from the input string.
        :param string_series: pd.Series, input string series
             clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
+    async def __remove_digits(self, string_series: pd.Series):
         """
        Removes digits from the input string.
        :param string_series: pd.Series, input string series
         return clean_string_series.map(self.__remove_double_whitespaces)
     @staticmethod
+    async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
         """
         Removes words/tokens where minlen <= len <= maxlen.
         :param string_series: pd.Series, input string series
                                                                          (len(word) > maxlen) or (len(word) < minlen)]))
         return clean_string_series
+    async def __remove_stop_words(self, string_series: pd.Series):
         """
        Removes stop words from the input string.
        :param string_series: pd.Series, input string series
         return string_series.map(str_remove_stop_words)
+    async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
                                   bottom_p: int = None, dataset: str = 'train'):
         """
         Removes top_p percent (frequent) words and bottom_p percent (rare) words.
                                                                              if word not in self.words_to_remove]))
             return clean_string_series
+    async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
         """
         Entry point.
         :param string_series: pd.Series, input string series
         string_series = string_series.str.replace("<br>", " ")
         string_series = string_series.str.lower().copy()
         string_series = string_series.map(unidecode).copy()
+        string_series = await self.__remove_url(string_series=string_series)
+        string_series = await self.__expand(string_series=string_series)
         if self.remove_punct:
+            string_series = await self.__remove_punct(string_series=string_series)
         if self.remove_digits:
+            string_series = await self.__remove_digits(string_series=string_series)
         if self.remove_stop_words:
+            string_series = await self.__remove_stop_words(string_series=string_series)
         if self.remove_short_words:
+            string_series = await self.__remove_short_words(string_series=string_series,
                                                       minlen=self.minlen,
                                                       maxlen=self.maxlen)
+        string_series = await self.__remove_top_bottom_words(string_series=string_series,
                                                        top_p=self.top_p,
                                                        bottom_p=self.bottom_p, dataset=dataset)
         return string_series
+async def get_frequent_words_html(df):
     text_preprocess = TextPreprocessor()
+    preprocessed_txt = await text_preprocess.preprocess(df['title'] + ' ' + df['description'])
     counter = Counter(' '.join([*preprocessed_txt]).split())
     freq_tokens_html = '<div class="word-cloud-container">'