Spaces:
Running
Running
🚧 add stopword removal fn
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
utils.py
CHANGED
@@ -15,8 +15,10 @@ logging.basicConfig(
|
|
15 |
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
16 |
level=logging.INFO,
|
17 |
)
|
|
|
18 |
import torch
|
19 |
from natsort import natsorted
|
|
|
20 |
from rapidfuzz import fuzz
|
21 |
|
22 |
# Define stopwords
|
@@ -25,6 +27,28 @@ STOPWORDS = set(
|
|
25 |
)
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def remove_stagnant_files(
|
29 |
freq: str = "hourly",
|
30 |
search_path: str = ".",
|
|
|
15 |
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
16 |
level=logging.INFO,
|
17 |
)
|
18 |
+
|
19 |
import torch
|
20 |
from natsort import natsorted
|
21 |
+
from nltk.tokenize import word_tokenize
|
22 |
from rapidfuzz import fuzz
|
23 |
|
24 |
# Define stopwords
|
|
|
27 |
)
|
28 |
|
29 |
|
30 |
+
def remove_stopwords(text: str, stopwords: list = STOPWORDS) -> str:
    """
    remove_stopwords - Remove stopwords from a string.

    The text is split with ``word_tokenize``; leading and trailing punctuation
    is stripped from every token, and a token is dropped when its lowercase
    form is found in ``stopwords``. Tokens that are empty after stripping
    (i.e. they consisted only of punctuation) are dropped as well, so the
    result never contains doubled spaces. Surviving tokens keep their
    original casing.

    :param str text: text to remove stopwords from
    :param list stopwords: lowercase stopwords to remove (any container that
        supports ``in`` works), defaults to STOPWORDS
    :return str: the remaining tokens joined by single spaces
    """
    # Imported locally so the function does not rely on a module-level
    # ``import string`` that is not visible next to this definition.
    import string

    filtered_words = []
    for token in word_tokenize(text):
        word = token.strip(string.punctuation)  # remove punctuation

        # Punctuation-only tokens collapse to "" here; "" is never a stopword,
        # so without this guard it would be kept and joined as an extra space.
        if not word:
            continue

        # Membership is tested on the lowercase form, the output keeps casing.
        if word.lower() not in stopwords:
            filtered_words.append(word)

    return " ".join(filtered_words)
|
50 |
+
|
51 |
+
|
52 |
def remove_stagnant_files(
|
53 |
freq: str = "hourly",
|
54 |
search_path: str = ".",
|