Spaces:
Sleeping
Sleeping
🚸 kw based file naming
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
utils.py
CHANGED
@@ -9,6 +9,12 @@ from pathlib import Path
|
|
9 |
|
10 |
import torch
|
11 |
from natsort import natsorted
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
def validate_pytorch2(torch_version: str = None):
|
@@ -88,6 +94,57 @@ def load_example_filenames(example_path: str or Path):
|
|
88 |
return examples
|
89 |
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
def saves_summary(
|
92 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
93 |
):
|
@@ -99,16 +156,18 @@ def saves_summary(
|
|
99 |
add_signature: whether to add a signature to the output file
|
100 |
kwargs: additional keyword arguments to include in the output file
|
101 |
"""
|
102 |
-
outpath = (
|
103 |
-
Path.cwd() / f"document_summary_{get_timestamp()}.txt"
|
104 |
-
if outpath is None
|
105 |
-
else Path(outpath)
|
106 |
-
)
|
107 |
sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
|
108 |
sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
|
109 |
scores_text = "\n".join(sum_scores)
|
110 |
full_summary = "\n".join(sum_text)
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
with open(
|
113 |
outpath,
|
114 |
"w",
|
|
|
9 |
|
10 |
import torch
|
11 |
from natsort import natsorted
|
12 |
+
from typing import List
|
13 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
14 |
+
from itertools import combinations
|
15 |
+
from collections import defaultdict
|
16 |
+
from rapidfuzz import fuzz
|
17 |
+
from nltk.corpus import stopwords
|
18 |
|
19 |
|
20 |
def validate_pytorch2(torch_version: str = None):
|
|
|
94 |
return examples
|
95 |
|
96 |
|
97 |
+
def extract_keywords(text: str, num_keywords: int = 3) -> List[str]:
    """
    Extracts keywords from a text using a TextRank-style ranking.

    Words that appear in the same sentence are linked in a co-occurrence
    graph; each word's score is then refined for a fixed number of passes
    from the scores of its neighbours. Words that never share a sentence
    with another retained word are not part of the graph and therefore are
    never returned.

    Args:
        text: The text to extract keywords from.
        num_keywords: The maximum number of keywords to extract. Default is 3.

    Returns:
        A list of at most ``num_keywords`` lower-cased keywords, ordered from
        highest to lowest score. Empty if ``text`` is empty or
        ``num_keywords`` is not positive.
    """
    # Nothing to rank: return early (a negative count would otherwise slice
    # from the wrong end of the ranked list).
    if not text or num_keywords <= 0:
        return []

    base_score = 0.15  # score every word receives regardless of its neighbours
    damping = 0.85  # weight given to the neighbours' contribution
    iterations = 10  # fixed number of refinement passes
    similarity_cutoff = 70  # fuzz.ratio above this counts as a near-duplicate

    # Remove stopwords from the input text
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.lower().split() if word not in stop_words)

    # Tokenize into sentences and words. The whitespace pass above misses
    # stop words glued to punctuation ("the,"), so filter again per token.
    # Tokens shorter than 3 characters or made only of punctuation ("...")
    # are dropped as well.
    sentences = [
        [
            token
            for token in word_tokenize(sentence)
            if len(token) >= 3
            and token not in stop_words
            and any(char.isalnum() for char in token)
        ]
        for sentence in sent_tokenize(text)
    ]

    # Create a symmetric graph of word co-occurrences within each sentence
    cooccur = defaultdict(lambda: defaultdict(int))
    for sentence in sentences:
        for w1, w2 in combinations(sentence, 2):
            cooccur[w1][w2] += 1
            cooccur[w2][w1] += 1

    # Total edge weight per word; constant across passes, so compute it once.
    totals = {word: sum(neighbours.values()) for word, neighbours in cooccur.items()}

    # Assign scores to words; scores are updated in place, so later words in
    # a pass already see the refreshed scores of earlier ones.
    scores = defaultdict(float)
    for _ in range(iterations):
        for word, neighbours in cooccur.items():
            scores[word] = base_score + damping * sum(
                count / totals[other] * scores[other]
                for other, count in neighbours.items()
            )

    # Walk the words from best to worst score, skipping near-duplicates of an
    # already accepted keyword, until num_keywords have been collected.
    # De-duplicating while walking (rather than after truncating) lets a
    # lower-ranked distinct word fill the slot of a rejected duplicate.
    ranked = sorted(scores, key=scores.get, reverse=True)
    final_keywords = []
    for candidate in ranked:
        if len(final_keywords) >= num_keywords:
            break
        if all(
            fuzz.ratio(candidate, kept) <= similarity_cutoff
            for kept in final_keywords
        ):
            final_keywords.append(candidate)

    return final_keywords
|
146 |
+
|
147 |
+
|
148 |
def saves_summary(
|
149 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
150 |
):
|
|
|
156 |
add_signature: whether to add a signature to the output file
|
157 |
kwargs: additional keyword arguments to include in the output file
|
158 |
"""
|
|
|
|
|
|
|
|
|
|
|
159 |
sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
|
160 |
sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
|
161 |
scores_text = "\n".join(sum_scores)
|
162 |
full_summary = "\n".join(sum_text)
|
163 |
|
164 |
+
keywords = "_".join(extract_keywords(full_summary))
|
165 |
+
outpath = (
|
166 |
+
Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
|
167 |
+
if outpath is None
|
168 |
+
else Path(outpath)
|
169 |
+
)
|
170 |
+
|
171 |
with open(
|
172 |
outpath,
|
173 |
"w",
|