sdhanabal1 committed on
Commit
ce42613
1 Parent(s): 91466d8

Address PR review comments

Browse files
Files changed (3) hide show
  1. Summarizer.py +1 -0
  2. app.py +88 -87
  3. requirements.txt +3 -3
Summarizer.py CHANGED
@@ -12,6 +12,7 @@ from transformers import Pipeline
12
 
13
  class Summarizer:
14
  DEFAULT_LANGUAGE = "english"
 
15
 
16
  def __init__(self, pipeline: Pipeline):
17
  self.pipeline = pipeline
 
12
 
13
  class Summarizer:
14
  DEFAULT_LANGUAGE = "english"
15
+ DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
16
 
17
  def __init__(self, pipeline: Pipeline):
18
  self.pipeline = pipeline
app.py CHANGED
@@ -6,99 +6,100 @@ from validators import ValidationFailure
6
 
7
  from Summarizer import Summarizer
8
 
9
- nltk.download('punkt')
10
-
11
- DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
12
-
13
- st.markdown('# Terms & conditions summarization :pencil:')
14
- st.write('Do you also take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up for a new app? :thinking_face: \nNo?'
15
- 'Well have we got a demo for you!'
16
- 'Just copy-paste the lengthy Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!'
17
- 'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)'
18
- 'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization')
19
- st.write('Want to find out more?'
20
- 'For information about the extractive summarization :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis'
21
- 'For information about the abstractive summarization :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')
22
-
23
- st.markdown("""
24
- To use this:
25
- - Number of sentences to be extracted is configurable
26
- - Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
27
- """)
28
-
29
-
30
- @st.cache(allow_output_mutation=True,
31
- suppress_st_warning=True,
32
- show_spinner=False)
33
- def create_pipeline():
34
- with st.spinner('Please wait for the model to load...'):
35
- terms_and_conditions_pipeline = pipeline(
36
- task='summarization',
37
- model='ml6team/distilbart-tos-summarizer-tosdr',
38
- tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
39
- )
40
- return terms_and_conditions_pipeline
41
-
42
-
43
- def display_abstractive_summary(summary) -> None:
44
- st.subheader("Abstractive Summary")
45
- st.markdown('#####')
46
- st.markdown(summary)
47
-
48
-
49
- def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
50
- st.subheader("Extractive Summary")
51
- st.markdown('#####')
52
- terms_and_conditions = " ".join(sentence for sentence in terms_and_conditions_sentences)
53
- replaced_text = terms_and_conditions
54
- for sentence in summary_sentences:
55
- replaced_text = replaced_text.replace(sentence, f"<span style='background-color: #FFFF00'>{sentence}</span>")
56
- st.write(replaced_text, unsafe_allow_html=True)
57
-
58
 
59
- def is_valid_url(url: str) -> bool:
60
- result = validators.url(url)
61
- if isinstance(result, ValidationFailure):
62
- return False
63
- return True
64
-
65
-
66
- summarizer: Summarizer = Summarizer(create_pipeline())
67
-
68
- if 'tc_text' not in st.session_state:
69
- st.session_state['tc_text'] = ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- if 'sentences_length' not in st.session_state:
72
- st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
73
 
74
- st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
75
- st.header("Input")
76
 
77
- with st.form(key='terms-and-conditions'):
78
- sentences_length_input = st.number_input(
79
- label='Number of sentences to be extracted:',
80
- min_value=1,
81
- value=st.session_state.sentences_length
82
- )
83
- tc_text_input = st.text_area(
84
- value=st.session_state.tc_text,
85
- label='Terms & conditions content or specify an URL:',
86
- height=240
87
- )
88
 
89
- submit_button = st.form_submit_button(label='Summarize')
 
90
 
91
- if submit_button:
 
92
 
93
- if is_valid_url(tc_text_input):
94
- (all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_url(tc_text_input,
95
- sentences_length_input)
96
- else:
97
- (all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_text(tc_text_input,
98
- sentences_length_input)
99
 
100
- extract_summary = " ".join([sentence for sentence in extract_summary_sentences])
101
- abstract_summary = summarizer.abstractive_summary(extract_summary)
102
 
103
- display_extractive_summary(all_sentences, extract_summary_sentences)
104
- display_abstractive_summary(abstract_summary)
 
6
 
7
  from Summarizer import Summarizer
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def main() -> None:
11
+ nltk.download('punkt')
12
+
13
+ st.markdown('# Terms & conditions summarization :pencil:')
14
+ st.write('Do you also take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up for a new app? :thinking_face: \nNo?'
15
+ 'Well have we got a demo for you!'
16
+ 'Just copy-paste the lengthy Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!'
17
+ 'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)'
18
+ 'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization')
19
+ st.write('Want to find out more?'
20
+ 'For information about the extractive summarization :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis'
21
+ 'For information about the abstractive summarization :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')
22
+
23
+ st.markdown("""
24
+ To use this:
25
+ - Number of sentences to be extracted is configurable
26
+ - Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
27
+ """)
28
+
29
+ @st.cache(allow_output_mutation=True,
30
+ suppress_st_warning=True,
31
+ show_spinner=False)
32
+ def create_pipeline():
33
+ with st.spinner('Please wait for the model to load...'):
34
+ terms_and_conditions_pipeline = pipeline(
35
+ task='summarization',
36
+ model='ml6team/distilbart-tos-summarizer-tosdr',
37
+ tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
38
+ )
39
+ return terms_and_conditions_pipeline
40
+
41
+ def display_abstractive_summary(summary) -> None:
42
+ st.subheader("Abstractive Summary")
43
+ st.markdown('#####')
44
+ st.markdown(summary)
45
+
46
+ def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
47
+ st.subheader("Extractive Summary")
48
+ st.markdown('#####')
49
+ terms_and_conditions = " ".join(sentence for sentence in terms_and_conditions_sentences)
50
+ replaced_text = terms_and_conditions
51
+ for sentence in summary_sentences:
52
+ replaced_text = replaced_text.replace(sentence,
53
+ f"<span style='background-color: #FFFF00'>{sentence}</span>")
54
+ st.write(replaced_text, unsafe_allow_html=True)
55
+
56
+ def is_valid_url(url: str) -> bool:
57
+ result = validators.url(url)
58
+ if isinstance(result, ValidationFailure):
59
+ return False
60
+ return True
61
+
62
+ summarizer: Summarizer = Summarizer(create_pipeline())
63
+
64
+ if 'tc_text' not in st.session_state:
65
+ st.session_state['tc_text'] = ''
66
+
67
+ if 'sentences_length' not in st.session_state:
68
+ st.session_state['sentences_length'] = Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
69
+
70
+ st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
71
+ st.header("Input")
72
+
73
+ with st.form(key='terms-and-conditions'):
74
+ sentences_length_input = st.number_input(
75
+ label='Number of sentences to be extracted:',
76
+ min_value=1,
77
+ value=st.session_state.sentences_length
78
+ )
79
+ tc_text_input = st.text_area(
80
+ value=st.session_state.tc_text,
81
+ label='Terms & conditions content or specify an URL:',
82
+ height=240
83
+ )
84
 
85
+ submit_button = st.form_submit_button(label='Summarize')
 
86
 
87
+ if submit_button:
 
88
 
89
+ if is_valid_url(tc_text_input):
90
+ (all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_url(tc_text_input,
91
+ sentences_length_input)
92
+ else:
93
+ (all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_text(tc_text_input,
94
+ sentences_length_input)
 
 
 
 
 
95
 
96
+ extract_summary = " ".join([sentence for sentence in extract_summary_sentences])
97
+ abstract_summary = summarizer.abstractive_summary(extract_summary)
98
 
99
+ display_extractive_summary(all_sentences, extract_summary_sentences)
100
+ display_abstractive_summary(abstract_summary)
101
 
 
 
 
 
 
 
102
 
103
+ if __name__ == "__main__":
104
+ main()
105
 
 
 
requirements.txt CHANGED
@@ -2,7 +2,7 @@ nlpaug==1.1.7
2
  streamlit
3
  torch==1.9.1
4
  torchvision==0.10.1
5
- transformers
6
  sumy==0.9.0
7
- nltk
8
- validators
 
2
  streamlit
3
  torch==1.9.1
4
  torchvision==0.10.1
5
+ transformers==4.10.3
6
  sumy==0.9.0
7
+ nltk==3.6.7
8
+ validators==0.18.2