document-summarization

Runtime error

App Files Files Community

pszemraj committed on May 29, 2023

Commit

ca983bc

•

1 Parent(s): f84fce9

⚗️ ⚡️ better stopwords and splitting

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

utils.py +39 -21

utils.py CHANGED Viewed

@@ -19,11 +19,11 @@ logging.basicConfig(
 import torch
 from natsort import natsorted
-from nltk.tokenize import word_tokenize, WhitespaceTokenizer
 from rapidfuzz import fuzz
 STOPWORDS = set(
-    "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
 )
@@ -66,30 +66,48 @@ def remove_stopwords(
     :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
     :return str: text with stopwords removed
     """
-    words = (
-        contraction_aware_tokenize(text)
-        if contraction_tokenize
-        else word_tokenize(text)
-    )
-    filtered_words = []
-    for word in words:
-        # Remove leading and trailing punctuation marks
-        word = word.strip(string.punctuation)
-        if word.lower() not in stopwords:
-            filtered_words.append(word)
-    filtered_text = " ".join(filtered_words)
-    # Replace multiple consecutive whitespaces with a single space
-    filtered_text = re.sub(r"\s+", " ", filtered_text)
-    filtered_text = filtered_text.strip()
-    # Restore original whitespaces around punctuation marks
-    filtered_text = re.sub(
-        r"\s*([{}])\s*".format(re.escape(string.punctuation)), r"\1", filtered_text
-    )
     return filtered_text

 import torch
 from natsort import natsorted
+from nltk.tokenize import word_tokenize, WhitespaceTokenizer, sent_tokenize
 from rapidfuzz import fuzz
 STOPWORDS = set(
+    "a about above after again all also am an and any are aren't as at back be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he'd he'll he's hence her here here's hers herself him himself his how how's however i'd i'll i'm i've if in into is isn't it's its itself just let's me more moreover most mustn't my myself new nor now of off on once only or other ought our ours ourselves out over own really same shan't she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's therefore these they they'd they'll they're they've this those through thus to too under until up use used using very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you'd you'll you're you've your yours yourself yourselves".split()
 )
     :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
     :return str: text with stopwords removed
     """
+    lines = text.split("\n")
+    filtered_lines = []
+    def fix_commas(text: str) -> str:
+        """fixes commas in text to have a space after them"""
+        spaced_text = text.replace(",", ", ")
+        return spaced_text.replace("  ", " ").strip()
+    for line in lines:
+        sentences = sent_tokenize(line)
+        filtered_sentences = []
+        for sentence in sentences:
+            # Insert a space after punctuation that is directly followed by a word character, so the tokenizer splits them apart
+            sentence_with_spaces = re.sub(r"([.,!?])(\w)", r"\1 \2", sentence[:-1])
+            words = (
+                contraction_aware_tokenize(sentence_with_spaces)
+                if contraction_tokenize
+                else word_tokenize(sentence_with_spaces)
+            )
+            filtered_words = []
+            for word in words:
+                if word.lower() not in stopwords:
+                    filtered_words.append(word)
+            filtered_sentence = " ".join(filtered_words)
+            # Strip the whitespace that follows punctuation marks after re-joining the tokens
+            filtered_sentence = re.sub(r"([.,!?])\s*", r"\1", filtered_sentence)
+            filtered_sentences.append(filtered_sentence + sentence[-1])
+        filtered_line = " ".join(filtered_sentences)
+        # Replace multiple consecutive whitespaces with a single space
+        filtered_line = re.sub(r"\s+", " ", filtered_line)
+        filtered_line = fix_commas(filtered_line.strip())
+        filtered_lines.append(filtered_line)
+    filtered_text = "\n".join(filtered_lines)
     return filtered_text