pszemraj committed
Commit
e9ed1f2
1 Parent(s): 9d26661

⚡️ drop nltk for kw


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2)
  1. app.py +1 -4
  2. utils.py +32 -25
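
In short: keyword extraction in utils.py no longer relies on NLTK's stopword corpus or tokenizers, so app.py only needs the slimmed-down nltk.download calls. A minimal sketch of the tokenization swap (illustrative only; the sample text and the three-word stopword subset below are not from the repo):

import re

text = "Keyword extraction without NLTK keeps only words of three or more characters."

# Old approach (needed NLTK data): stopwords.words("english") plus word_tokenize(text).
# New approach (stdlib only): a regex keeps lowercase word tokens of length >= 3,
# then a hard-coded stopword set (inlined in utils.py) filters them out.
stop_words = {"of", "or", "without"}  # tiny illustrative subset of the inlined list
tokens = [w for w in re.findall(r"\b\w{3,}\b", text.lower()) if w not in stop_words]
print(tokens)
# ['keyword', 'extraction', 'nltk', 'keeps', 'only', 'words', 'three', 'more', 'characters']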
app.py CHANGED
@@ -36,10 +36,7 @@ _here = Path(__file__).parent
 
 # os.environ["NLTK_DATA"] = str(_here / "nltk_data")
 nltk.download("punkt", force=True, quiet=True)
-nltk.download(
-    "popular",
-    force=True,
-)
+nltk.download("popular", force=True, quiet=True)
 
 
 MODEL_OPTIONS = [
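
Both downloads are now forced and quiet, so nothing is printed on startup. A small sanity-check sketch for confirming the data actually landed, using nltk.data.find, which raises LookupError when a resource is missing (the resource path is the standard NLTK one, not something defined in this repo):

import nltk

nltk.download("punkt", force=True, quiet=True)
nltk.download("popular", force=True, quiet=True)

try:
    # punkt is what sent_tokenize/word_tokenize need at runtime
    nltk.data.find("tokenizers/punkt")
    print("punkt tokenizer data is available")
except LookupError:
    print("punkt tokenizer data is missing")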
utils.py CHANGED
@@ -17,11 +17,11 @@ from nltk.corpus import stopwords
 from nltk.tokenize import sent_tokenize, word_tokenize
 from rapidfuzz import fuzz
 
-nltk.download("punkt", quiet=True)
-nltk.download(
-    "popular",
-    quiet=True,
-)
+import re
+from typing import List
+from itertools import islice
+from collections import defaultdict, deque
+from rapidfuzz import fuzz
 
 
 def validate_pytorch2(torch_version: str = None):
@@ -101,44 +101,51 @@ def load_example_filenames(example_path: str or Path):
     return examples
 
 
-def extract_keywords(text: str, num_keywords: int = 3) -> List[str]:
+def extract_keywords(
+    text: str, num_keywords: int = 3, window_size: int = 5
+) -> List[str]:
     """
-    Extracts keywords from a text using the TextRank algorithm.
+    Extracts keywords from a text using a simplified TextRank algorithm.
 
     Args:
         text: The text to extract keywords from.
         num_keywords: The number of keywords to extract. Default is 5.
+        window_size: The number of words considered for co-occurrence. Default is 5.
 
     Returns:
         A list of strings, where each string is a keyword extracted from the input text.
     """
-    # Remove stopwords from the input text
-    stop_words = set(stopwords.words("english"))
-    text = " ".join([word for word in text.lower().split() if word not in stop_words])
-
-    # Tokenize the text into sentences and words
-    sentences = sent_tokenize(text)
-    words = [word_tokenize(sentence) for sentence in sentences]
-
-    # Filter out words that are shorter than 3 characters
-    words = [[word for word in sentence if len(word) >= 3] for sentence in words]
+    # Define stopwords
+    stop_words = set(
+        "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
+    )
+
+    # Remove stopwords and tokenize the text into words
+    words = [
+        word
+        for word in re.findall(r"\b\w{3,}\b", text.lower())
+        if word not in stop_words
+    ]
 
-    # Create a graph of word co-occurrences
+    # Create a graph of word co-occurrences within a moving window of words
     cooccur = defaultdict(lambda: defaultdict(int))
-    for sentence in words:
-        for w1, w2 in combinations(sentence, 2):
+    deque_words = deque(maxlen=window_size)
+    for word in words:
+        for w1, w2 in combinations(deque_words, 2):
             cooccur[w1][w2] += 1
             cooccur[w2][w1] += 1
+        deque_words.append(word)
 
-    # Assign scores to words using the TextRank algorithm
+    # Assign scores to words using a simplified TextRank algorithm
     scores = defaultdict(float)
-    for i in range(10):
-        for word in cooccur:
-            score = 0.15 + 0.85 * sum(
+    for _ in range(10):
+        new_scores = defaultdict(float)
+        for word, co_words in cooccur.items():
+            new_scores[word] = 0.15 + 0.85 * sum(
                 cooccur[word][other] / sum(cooccur[other].values()) * scores[other]
-                for other in cooccur[word]
+                for other in co_words
            )
-            scores[word] = score
+        scores = new_scores
 
     # Sort the words by score and return the top num_keywords keywords
    keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords]
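
For reference, a minimal usage sketch of the rewritten function. The sample text and the expected-output comment are illustrative only, and it assumes utils.py is importable from the working directory (and that combinations is already imported from itertools elsewhere in the module, since this hunk only adds islice and deque):

from utils import extract_keywords  # the function changed in this commit

sample = (
    "Text summarization condenses a long document into a shorter summary. "
    "Keyword extraction then surfaces the most important terms in that summary."
)

# num_keywords and window_size mirror the new signature (defaults: 3 and 5).
keywords = extract_keywords(sample, num_keywords=3, window_size=5)
print(keywords)  # e.g. ['summary', 'summarization', 'document'] (exact output depends on the text)

Swapping per-sentence co-occurrence for a deque(maxlen=window_size) sliding window is what lets the function drop sent_tokenize and the punkt data entirely; scores are still refined over 10 iterations with the usual 0.15/0.85 TextRank damping.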