sdhanabal1 committed on
Commit
abcaca9
1 Parent(s): 8d4dd5e

Refactor and highlight extract summary

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. Summarizer.py +45 -0
  3. app.py +46 -58
  4. requirements.txt +2 -1
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  .idea/
2
  venv/
 
1
  .idea/
2
  venv/
3
+ __pycache__/
Summarizer.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textwrap import wrap
2
+
3
+ from sumy.parsers import DocumentParser
4
+ from sumy.parsers.html import HtmlParser
5
+ from sumy.parsers.plaintext import PlaintextParser
6
+ from sumy.nlp.tokenizers import Tokenizer
7
+ from sumy.nlp.stemmers import Stemmer
8
+ from sumy.summarizers.lsa import LsaSummarizer
9
+ from sumy.utils import get_stop_words
10
+ from transformers import Pipeline
11
+
12
+
13
+ class Summarizer:
14
+ DEFAULT_LANGUAGE = "english"
15
+
16
+ def __init__(self, pipeline: Pipeline):
17
+ self.pipeline = pipeline
18
+ stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
19
+ self.lsa_summarizer = LsaSummarizer(stemmer)
20
+ self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)
21
+
22
+ @staticmethod
23
+ def sentence_list(summarized_sentences) -> list:
24
+ summarized_list = []
25
+ for sentence in summarized_sentences:
26
+ summarized_list.append(sentence._text)
27
+ return summarized_list
28
+
29
+ def __extractive_summary(self, parser: DocumentParser, sentences_count):
30
+ summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
31
+ summarized_list = Summarizer.sentence_list(summarized_sentences)
32
+ all_sentences_list = Summarizer.sentence_list(parser.document.sentences)
33
+ return all_sentences_list, summarized_list
34
+
35
+ def extractive_summary_from_text(self, text: str, sentences_count: int) -> (list, list):
36
+ parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
37
+ return self.__extractive_summary(parser, sentences_count)
38
+
39
+ def extractive_summary_from_url(self, url: str, sentences_count: int) -> (list, list):
40
+ parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
41
+ return self.__extractive_summary(parser, sentences_count)
42
+
43
+ def abstractive_summary(self, summary: str) -> str:
44
+ summary_text = " ".join([result['summary_text'] for result in self.pipeline(wrap(summary, 2048))])
45
+ return summary_text
app.py CHANGED
@@ -1,21 +1,14 @@
1
- from textwrap import wrap
2
- from transformers import pipeline
3
  import streamlit as st
 
 
 
4
 
5
- from sumy.parsers.plaintext import PlaintextParser
6
- from sumy.nlp.tokenizers import Tokenizer
7
- from sumy.nlp.stemmers import Stemmer
8
- from sumy.summarizers.lsa import LsaSummarizer
9
- from sumy.utils import get_stop_words
10
 
11
- import nltk
12
  nltk.download('punkt')
13
 
14
- DEFAULT_LANGUAGE = "english"
15
  DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
16
- stemmer = Stemmer(DEFAULT_LANGUAGE)
17
- lsa_summarizer = LsaSummarizer(stemmer)
18
- lsa_summarizer.stop_words = get_stop_words(language=DEFAULT_LANGUAGE)
19
 
20
  st.markdown('# Terms & conditions abstractive summarization model :pencil:')
21
  st.write('This app provides the abstract summary of the provided terms & conditions. '
@@ -25,14 +18,14 @@ st.write('Information about the model :point_right: https://huggingface.co/ml6te
25
  st.markdown("""
26
  To use this:
27
  - Number of sentences to be extracted is configurable
28
- - Copy terms & conditions and hit 'Summarize'
29
  """)
30
 
31
 
32
  @st.cache(allow_output_mutation=True,
33
  suppress_st_warning=True,
34
  show_spinner=False)
35
- def load_model():
36
  with st.spinner('Please wait for the model to load...'):
37
  terms_and_conditions_pipeline = pipeline(
38
  task='summarization',
@@ -42,15 +35,40 @@ def load_model():
42
  return terms_and_conditions_pipeline
43
 
44
 
45
- tc_pipeline = load_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  if 'tc_text' not in st.session_state:
48
- st.session_state['tc_text'] = ""
49
 
50
  if 'sentences_length' not in st.session_state:
51
  st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
52
 
 
53
  st.header("Input")
 
54
  with st.form(key='terms-and-conditions'):
55
  sentences_length_input = st.number_input(
56
  label='Number of sentences to be extracted:',
@@ -59,53 +77,23 @@ with st.form(key='terms-and-conditions'):
59
  )
60
  tc_text_input = st.text_area(
61
  value=st.session_state.tc_text,
62
- label='Terms & conditions text:',
63
- height=240
64
- )
65
- submit_button = st.form_submit_button(label='Summarize')
66
-
67
- st.header("Output")
68
-
69
-
70
- def generate_abstractive_summary(summary) -> str:
71
- summary_text = " ".join([result['summary_text'] for result in tc_pipeline(wrap(summary, 2048))])
72
- return summary_text
73
-
74
-
75
- def generate_extractive_summary(text, sentences_count: int) -> str:
76
- parser = PlaintextParser.from_string(text, Tokenizer(DEFAULT_LANGUAGE))
77
- summarized_sentences = lsa_summarizer(parser.document, sentences_count)
78
- summarized_text = " ".join([sentence._text for sentence in summarized_sentences])
79
- return summarized_text
80
-
81
-
82
- def display_abstractive_summary(summary) -> None:
83
- st.subheader("Abstractive Summary")
84
- st.markdown('#####')
85
- st.text_area(
86
- value=summary,
87
- label='',
88
- height=240
89
- )
90
-
91
-
92
- def display_extractive_summary(summary) -> None:
93
- st.subheader("Extractive Summary")
94
- st.markdown('#####')
95
- st.text_area(
96
- value=summary,
97
- label='',
98
  height=240
99
  )
100
 
 
101
 
102
  if submit_button:
103
- tc_text = tc_text_input
104
- sentences_length = sentences_length_input
105
 
106
- extract_summary = generate_extractive_summary(tc_text, sentences_length)
107
- abstract_summary = generate_abstractive_summary(extract_summary)
 
 
 
 
108
 
109
- display_extractive_summary(extract_summary)
110
- display_abstractive_summary(abstract_summary)
111
 
 
 
1
+ import nltk
 
2
  import streamlit as st
3
+ import validators
4
+ from transformers import pipeline
5
+ from validators import ValidationFailure
6
 
7
+ from Summarizer import Summarizer
 
 
 
 
8
 
 
9
  nltk.download('punkt')
10
 
 
11
  DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
 
 
 
12
 
13
  st.markdown('# Terms & conditions abstractive summarization model :pencil:')
14
  st.write('This app provides the abstract summary of the provided terms & conditions. '
18
  st.markdown("""
19
  To use this:
20
  - Number of sentences to be extracted is configurable
21
+ - Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
22
  """)
23
 
24
 
25
  @st.cache(allow_output_mutation=True,
26
  suppress_st_warning=True,
27
  show_spinner=False)
28
+ def create_pipeline():
29
  with st.spinner('Please wait for the model to load...'):
30
  terms_and_conditions_pipeline = pipeline(
31
  task='summarization',
35
  return terms_and_conditions_pipeline
36
 
37
 
38
+ def display_abstractive_summary(summary) -> None:
39
+ st.subheader("Abstractive Summary")
40
+ st.markdown('#####')
41
+ st.markdown(summary)
42
+
43
+
44
+ def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
45
+ st.subheader("Extractive Summary")
46
+ st.markdown('#####')
47
+ terms_and_conditions = " ".join(sentence for sentence in terms_and_conditions_sentences)
48
+ replaced_text = terms_and_conditions
49
+ for sentence in summary_sentences:
50
+ replaced_text = replaced_text.replace(sentence, f"<span style='background-color: #FFFF00'>{sentence}</span>")
51
+ st.write(replaced_text, unsafe_allow_html=True)
52
+
53
+
54
+ def is_valid_url(url: str) -> bool:
55
+ result = validators.url(url)
56
+ if isinstance(result, ValidationFailure):
57
+ return False
58
+ return True
59
+
60
+
61
+ summarizer: Summarizer = Summarizer(create_pipeline())
62
 
63
  if 'tc_text' not in st.session_state:
64
+ st.session_state['tc_text'] = ''
65
 
66
  if 'sentences_length' not in st.session_state:
67
  st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH
68
 
69
+ st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
70
  st.header("Input")
71
+
72
  with st.form(key='terms-and-conditions'):
73
  sentences_length_input = st.number_input(
74
  label='Number of sentences to be extracted:',
77
  )
78
  tc_text_input = st.text_area(
79
  value=st.session_state.tc_text,
80
+ label='Terms & conditions content or specify an URL:',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  height=240
82
  )
83
 
84
+ submit_button = st.form_submit_button(label='Summarize')
85
 
86
  if submit_button:
 
 
87
 
88
+ if is_valid_url(tc_text_input):
89
+ (all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_url(tc_text_input,
90
+ sentences_length_input)
91
+ else:
92
+ (all_sentences, extract_summary_sentences) = summarizer.extractive_summary_from_text(tc_text_input,
93
+ sentences_length_input)
94
 
95
+ extract_summary = " ".join([sentence for sentence in extract_summary_sentences])
96
+ abstract_summary = summarizer.abstractive_summary(extract_summary)
97
 
98
+ display_extractive_summary(all_sentences, extract_summary_sentences)
99
+ display_abstractive_summary(abstract_summary)
requirements.txt CHANGED
@@ -4,4 +4,5 @@ torch==1.9.1
4
  torchvision==0.10.1
5
  transformers
6
  sumy==0.9.0
7
- nltk
 
4
  torchvision==0.10.1
5
  transformers
6
  sumy==0.9.0
7
+ nltk
8
+ validators