Commit 4372d93 by sdhanabal1 (1 parent: c98407b)

Update min length to 24 tokens

Files changed:
- Summarizer.py +6 -9
- app.py +1 -1
- test_summarizer.py +3 -3
Summarizer.py
CHANGED

@@ -1,4 +1,3 @@
-import string
 import nltk
 
 from sumy.parsers import DocumentParser
@@ -13,8 +12,7 @@ from transformers import Pipeline
 
 class Summarizer:
     DEFAULT_LANGUAGE = "english"
-    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = …
-    STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
+    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 15
 
     def __init__(self, pipeline: Pipeline):
         self.pipeline = pipeline
@@ -34,16 +32,15 @@ class Summarizer:
         return " ".join([sentence for sentence in summary_sentences])
 
     @staticmethod
-    def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
+    def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
         accumulated_lists = []
         result_list = []
         cumulative_token_length = 0
         for sentence in summary_sentences:
             result_list.append(sentence)
             token_list = nltk.word_tokenize(sentence)
-
-            token_length …
-            if token_length + cumulative_token_length >= max_token_length:
+            token_length = len(token_list)
+            if token_length + cumulative_token_length >= split_token_length:
                 accumulated_lists.append(Summarizer.join_sentences(result_list))
                 result_list = []
                 cumulative_token_length = 0
@@ -72,9 +69,9 @@ class Summarizer:
         :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
         """
         wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
-                                                                       …
+                                                                       split_token_length=512)
         # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
         abstractive_summary_list = []
-        for result in self.pipeline(wrapped_sentences, min_length=…
+        for result in self.pipeline(wrapped_sentences, min_length=24, max_length=512):
             abstractive_summary_list.append(result['summary_text'])
         return abstractive_summary_list
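The hunks above hide the unchanged middle of split_sentences_by_token_length, so the diff alone does not show how cumulative_token_length grows or what happens to a trailing partial chunk. Below is a minimal runnable sketch of the whole routine, with those hidden parts inferred from the expectations in test_summarizer.py further down, and with join_sentences assumed to be a plain space-join:

import nltk

# nltk.download('punkt')  # tokenizer data, needed once

def split_sentences_by_token_length(summary_sentences: list, split_token_length: int) -> list:
    accumulated_lists = []
    result_list = []
    cumulative_token_length = 0
    for sentence in summary_sentences:
        result_list.append(sentence)
        token_list = nltk.word_tokenize(sentence)
        token_length = len(token_list)
        if token_length + cumulative_token_length >= split_token_length:
            # Chunk is full: emit it and start a new one.
            # (join_sentences in the class is assumed to be " ".join.)
            accumulated_lists.append(" ".join(result_list))
            result_list = []
            cumulative_token_length = 0
        else:
            # Assumed hidden context line: carry the running token count.
            cumulative_token_length += token_length
    if result_list:
        # Assumed hidden context: flush the leftover sentences as the last chunk.
        accumulated_lists.append(" ".join(result_list))
    return accumulated_lists

Run against the three test sentences, this reproduces the expected chunks for split_token_length values of 5, 7 and 10.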
app.py
CHANGED

@@ -19,7 +19,7 @@ def main() -> None:
         "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
     st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
                 'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
-                '…
+                'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:', unsafe_allow_html=True)
     st.markdown('<b>Want to find out more?</b> :brain:<br>'
                 'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
                 'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)
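For the abstractive part linked above, here is a hedged sketch of exercising the updated pipeline call end to end. The diff only shows that Summarizer receives an already-built Pipeline; constructing it with transformers.pipeline() and the linked model name is an assumption, not code from this repo:

from transformers import pipeline

# Assumption: the app builds a summarization pipeline for the linked model.
summarizer_pipeline = pipeline("summarization", model="ml6team/distilbart-tos-summarizer-tosdr")

chunks = ["One chunk of Terms & Conditions text, kept under 512 tokens."]
# min_length=24 is the new lower bound introduced by this commit; each
# result is a dict carrying the generated text under 'summary_text'.
for result in summarizer_pipeline(chunks, min_length=24, max_length=512):
    print(result["summary_text"])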
test_summarizer.py
CHANGED

@@ -8,19 +8,19 @@ def test_split_sentences_by_token_length():
         'Free.'
     ]
 
-    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=5)
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=5)
     assert split_sentences == [
         'Python is a programming language.',
         'Memory allocation. Free.'
     ]
 
-    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=7)
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=7)
     assert split_sentences == [
         'Python is a programming language. Memory allocation.',
         'Free.'
     ]
 
-    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=10)
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, split_token_length=10)
     assert split_sentences == [
         'Python is a programming language. Memory allocation. Free.'
     ]
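A quick sanity check on why split_token_length=5 yields exactly those chunks: nltk.word_tokenize counts punctuation as tokens, so the three test sentences are 6, 3 and 2 tokens long.

import nltk
# nltk.download('punkt')  # needed once

for s in ['Python is a programming language.', 'Memory allocation.', 'Free.']:
    print(len(nltk.word_tokenize(s)), s)
# 6 Python is a programming language.  -> 6 >= 5 closes the first chunk
# 3 Memory allocation.                 -> 3 < 5, the count carries over
# 2 Free.                              -> 3 + 2 >= 5 closes 'Memory allocation. Free.'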