ksvmuralidhar committed on
Commit
3e1ad25
1 Parent(s): ce359bc

Update word_cloud.py

Browse files
Files changed (1) hide show
  1. word_cloud.py +17 -17
word_cloud.py CHANGED
@@ -503,7 +503,7 @@ class TextPreprocessor:
503
  def __remove_double_whitespaces(string: str):
504
  return " ".join(string.split())
505
 
506
- def __remove_url(self, string_series: pd.Series):
507
  """
508
  Removes URLs from text
509
  :param string_series: pd.Series, input string series
@@ -514,7 +514,7 @@ class TextPreprocessor:
514
  repl=" ", regex=True).copy()
515
  return clean_string_series.map(self.__remove_double_whitespaces)
516
 
517
- def __expand(self, string_series: pd.Series):
518
  """
519
  Replaces contractions with expansions, e.g. don't with do not.
520
  :param string_series: pd.Series, input string series
@@ -525,7 +525,7 @@ class TextPreprocessor:
525
  clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
526
  return clean_string_series.map(self.__remove_double_whitespaces)
527
 
528
- def __remove_punct(self, string_series: pd.Series):
529
  """
530
  Removes punctuations from the input string.
531
  :param string_series: pd.Series, input string series
@@ -538,7 +538,7 @@ class TextPreprocessor:
538
  clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
539
  return clean_string_series.map(self.__remove_double_whitespaces)
540
 
541
- def __remove_digits(self, string_series: pd.Series):
542
  """
543
  Removes digits from the input string.
544
  :param string_series: pd.Series, input string series
@@ -548,7 +548,7 @@ class TextPreprocessor:
548
  return clean_string_series.map(self.__remove_double_whitespaces)
549
 
550
  @staticmethod
551
- def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
552
  """
553
  Removes words/tokens where minlen <= len <= maxlen.
554
  :param string_series: pd.Series, input string series
@@ -560,7 +560,7 @@ class TextPreprocessor:
560
  (len(word) > maxlen) or (len(word) < minlen)]))
561
  return clean_string_series
562
 
563
- def __remove_stop_words(self, string_series: pd.Series):
564
  """
565
  Removes stop words from the input string.
566
  :param string_series: pd.Series, input string series
@@ -572,7 +572,7 @@ class TextPreprocessor:
572
 
573
  return string_series.map(str_remove_stop_words)
574
 
575
- def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
576
  bottom_p: int = None, dataset: str = 'train'):
577
  """
578
  Removes top_p percent (frequent) words and bottom_p percent (rare) words.
@@ -605,7 +605,7 @@ class TextPreprocessor:
605
  if word not in self.words_to_remove]))
606
  return clean_string_series
607
 
608
- def preprocess(self, string_series: pd.Series, dataset: str = "train"):
609
  """
610
  Entry point.
611
  :param string_series: pd.Series, input string series
@@ -616,20 +616,20 @@ class TextPreprocessor:
616
  string_series = string_series.str.replace("<br>", " ")
617
  string_series = string_series.str.lower().copy()
618
  string_series = string_series.map(unidecode).copy()
619
- string_series = self.__remove_url(string_series=string_series)
620
- string_series = self.__expand(string_series=string_series)
621
 
622
  if self.remove_punct:
623
- string_series = self.__remove_punct(string_series=string_series)
624
  if self.remove_digits:
625
- string_series = self.__remove_digits(string_series=string_series)
626
  if self.remove_stop_words:
627
- string_series = self.__remove_stop_words(string_series=string_series)
628
  if self.remove_short_words:
629
- string_series = self.__remove_short_words(string_series=string_series,
630
  minlen=self.minlen,
631
  maxlen=self.maxlen)
632
- string_series = self.__remove_top_bottom_words(string_series=string_series,
633
  top_p=self.top_p,
634
  bottom_p=self.bottom_p, dataset=dataset)
635
 
@@ -639,9 +639,9 @@ class TextPreprocessor:
639
  return string_series
640
 
641
 
642
- def get_frequent_words_html(df):
643
  text_preprocess = TextPreprocessor()
644
- preprocessed_txt = text_preprocess.preprocess(df['title'] + ' ' + df['description'])
645
  counter = Counter(' '.join([*preprocessed_txt]).split())
646
 
647
  freq_tokens_html = '<div class="word-cloud-container">'
 
503
  def __remove_double_whitespaces(string: str):
504
  return " ".join(string.split())
505
 
506
+ async def __remove_url(self, string_series: pd.Series):
507
  """
508
  Removes URLs from text
509
  :param string_series: pd.Series, input string series
 
514
  repl=" ", regex=True).copy()
515
  return clean_string_series.map(self.__remove_double_whitespaces)
516
 
517
+ async def __expand(self, string_series: pd.Series):
518
  """
519
  Replaces contractions with expansions, e.g. don't with do not.
520
  :param string_series: pd.Series, input string series
 
525
  clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
526
  return clean_string_series.map(self.__remove_double_whitespaces)
527
 
528
+ async def __remove_punct(self, string_series: pd.Series):
529
  """
530
  Removes punctuations from the input string.
531
  :param string_series: pd.Series, input string series
 
538
  clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
539
  return clean_string_series.map(self.__remove_double_whitespaces)
540
 
541
+ async def __remove_digits(self, string_series: pd.Series):
542
  """
543
  Removes digits from the input string.
544
  :param string_series: pd.Series, input string series
 
548
  return clean_string_series.map(self.__remove_double_whitespaces)
549
 
550
  @staticmethod
551
+ async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
552
  """
553
  Removes words/tokens where minlen <= len <= maxlen.
554
  :param string_series: pd.Series, input string series
 
560
  (len(word) > maxlen) or (len(word) < minlen)]))
561
  return clean_string_series
562
 
563
+ async def __remove_stop_words(self, string_series: pd.Series):
564
  """
565
  Removes stop words from the input string.
566
  :param string_series: pd.Series, input string series
 
572
 
573
  return string_series.map(str_remove_stop_words)
574
 
575
+ async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
576
  bottom_p: int = None, dataset: str = 'train'):
577
  """
578
  Removes top_p percent (frequent) words and bottom_p percent (rare) words.
 
605
  if word not in self.words_to_remove]))
606
  return clean_string_series
607
 
608
+ async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
609
  """
610
  Entry point.
611
  :param string_series: pd.Series, input string series
 
616
  string_series = string_series.str.replace("<br>", " ")
617
  string_series = string_series.str.lower().copy()
618
  string_series = string_series.map(unidecode).copy()
619
+ string_series = await self.__remove_url(string_series=string_series)
620
+ string_series = await self.__expand(string_series=string_series)
621
 
622
  if self.remove_punct:
623
+ string_series = await self.__remove_punct(string_series=string_series)
624
  if self.remove_digits:
625
+ string_series = await self.__remove_digits(string_series=string_series)
626
  if self.remove_stop_words:
627
+ string_series = await self.__remove_stop_words(string_series=string_series)
628
  if self.remove_short_words:
629
+ string_series = await self.__remove_short_words(string_series=string_series,
630
  minlen=self.minlen,
631
  maxlen=self.maxlen)
632
+ string_series = await self.__remove_top_bottom_words(string_series=string_series,
633
  top_p=self.top_p,
634
  bottom_p=self.bottom_p, dataset=dataset)
635
 
 
639
  return string_series
640
 
641
 
642
+ async def get_frequent_words_html(df):
643
  text_preprocess = TextPreprocessor()
644
+ preprocessed_txt = await text_preprocess.preprocess(df['title'] + ' ' + df['description'])
645
  counter = Counter(' '.join([*preprocessed_txt]).split())
646
 
647
  freq_tokens_html = '<div class="word-cloud-container">'