wing-nus committed
Commit 3b85924
Parent: d76b2ba

Merge uncontrolled summarization and controlled summarization
Files changed (4):
  1. app.py +14 -78
  2. controlled_summarization.py +58 -54
  3. description.py +50 -53
  4. requirements.txt +1 -1
app.py CHANGED

@@ -2,12 +2,10 @@ import gradio as gr
 from description import *
 
 from reference_string_parsing import *
-from summarization import *
 from controlled_summarization import *
 
 with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
-    gr.Markdown("# SciAssist: Scientists' Assistant toolkit")
-    gr.Markdown("SciAssist currently supports Reference String Parsing, uncontrolled Summarization and Controlled Summarization. Github repo: https://github.com/WING-NUS/SciAssist")
+    gr.Markdown("# Gradio Demo for SciAssist")
     with gr.Tabs():
         # Reference String Parsing
         with gr.TabItem("Reference String Parsing"):
@@ -61,102 +59,40 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
                 outputs=rsp_str_output
             )
 
-        # Single Document Summarization
-        with gr.TabItem("Uncontrolled Summarization"):
-            with gr.Box():
-                gr.Markdown(ssum_str_md)
-                with gr.Row():
-                    with gr.Column():
-                        ssum_str = gr.Textbox(label="Input String")
-                    # with gr.Column():
-                    #     ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                    #     ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                with gr.Row():
-                    ssum_str_btn = gr.Button("Generate")
-                ssum_str_output = gr.Textbox(
-                    elem_id="htext",
-                    label="Summary",
-                )
-                ssum_str_examples = gr.Examples(examples=[[ssum_str_example], ],
-                                                inputs=[ssum_str])
-            with gr.Box():
-                gr.Markdown(ssum_file_md)
-                with gr.Row():
-                    with gr.Column():
-                        ssum_file = gr.File(label="Input File")
-                    # with gr.Column():
-                    #     ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                    #     ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                with gr.Row():
-                    ssum_file_btn = gr.Button("Generate")
-                ssum_file_output = gr.Textbox(
-                    elem_id="htext",
-                    label="Summary",
-                )
-                ssum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt"],["examples/BERT_paper.pdf"]],
-                                                 inputs=[ssum_file])
-
-            ssum_file_btn.click(
-                fn=ssum_for_file,
-                inputs=[ssum_file],
-                outputs=ssum_file_output
-            )
-            ssum_str_btn.click(
-                fn=ssum_for_str,
-                inputs=[ssum_str],
-                outputs=ssum_str_output
-            )
-
         # Controlled Summarization
-        with gr.TabItem("Controlled Summarization"):
-            with gr.Box():
-                gr.Markdown(ctrlsum_str_md)
-                with gr.Row():
-                    with gr.Column():
-                        ctrlsum_str = gr.Textbox(label="Input String")
-                    with gr.Column():
-                        # ctrlsum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                        # ctrlsum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                        ctrlsum_str_length = gr.Slider(0, 300, step=50, label="Length")
-                        ctrlsum_str_keywords = gr.Textbox(label="Keywords")
-                with gr.Row():
-                    ctrlsum_str_btn = gr.Button("Generate")
-                ctrlsum_str_output = gr.Textbox(
-                    elem_id="htext",
-                    label="Summary",
-                )
-                ctrlsum_str_examples = gr.Examples(examples=[[ssum_str_example, 50, "BERT" ], ],
-                                                   inputs=[ctrlsum_str, ctrlsum_str_length, ctrlsum_str_keywords])
+        with gr.TabItem("Summarization"):
+
             with gr.Box():
                 gr.Markdown(ctrlsum_file_md)
                 with gr.Row():
                     with gr.Column():
                         ctrlsum_file = gr.File(label="Input File")
+                        ctrlsum_str = gr.TextArea(label="Input String")
                     with gr.Column():
+                        gr.Markdown("* Length 0 will exert no control over length.")
                         # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
                         # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
                         ctrlsum_file_length = gr.Slider(0,300,step=50, label="Length")
-                        ctrlsum_file_keywords = gr.Textbox(label="Keywords")
+                        ctrlsum_file_keywords = gr.Textbox(label="Keywords",max_lines=1)
                 with gr.Row():
                     ctrlsum_file_btn = gr.Button("Generate")
                 ctrlsum_file_output = gr.Textbox(
                     elem_id="htext",
                     label="Summary",
                 )
-                ctrlsum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 100, ""],["examples/BERT_paper.pdf", 0, "BERT"]],
+                ctrlsum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 100, "", ""],["examples/BERT_paper.pdf", 0, "BERT"]],
                                                     inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
 
             ctrlsum_file_btn.click(
                 fn=ctrlsum_for_file,
-                inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords],
-                outputs=ctrlsum_file_output
-            )
-            ctrlsum_str_btn.click(
-                fn=ctrlsum_for_str,
-                inputs=[ctrlsum_str, ctrlsum_str_length, ctrlsum_str_keywords],
-                outputs=ctrlsum_str_output
+                inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str],
+                outputs=[ctrlsum_file_output, ctrlsum_str]
             )
+            def clear():
+                return None,0,None
+
+            ctrlsum_file.change(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords])
 
 
 
-demo.launch(share=False)
+demo.launch(share=True)
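With this layout, a single Generate button serves both input paths: the handler receives the uploaded file, the length and keyword controls, and the pasted string, and uploading a new file resets the other inputs through ctrlsum_file.change. Below is a minimal, self-contained sketch of that wiring, assuming the Gradio 3.x API used above; echo_summarize is a hypothetical stub standing in for the SciAssist pipeline.

    # Sketch: one "Generate" button serving both a file upload and a pasted string.
    # echo_summarize is a hypothetical stand-in for ctrlsum_for_file / SciAssist.
    import gradio as gr

    def echo_summarize(file, length, keywords, text):
        # Prefer the uploaded file; otherwise summarize the pasted string.
        source = file.name if file is not None else text
        return f"[summary of {source!r}, length={length}, keywords={keywords!r}]", text

    def clear():
        # Reset the pasted text, length slider and keywords when a new file arrives.
        return None, 0, None

    with gr.Blocks() as demo:
        infile = gr.File(label="Input File")
        instr = gr.TextArea(label="Input String")
        length = gr.Slider(0, 300, step=50, label="Length")
        keywords = gr.Textbox(label="Keywords", max_lines=1)
        btn = gr.Button("Generate")
        output = gr.Textbox(label="Summary")
        btn.click(echo_summarize, inputs=[infile, length, keywords, instr],
                  outputs=[output, instr])
        infile.change(clear, inputs=None, outputs=[instr, length, keywords])

    demo.launch()

Returning the pasted text as a second output lets the app echo back whatever was actually summarized, which is how the real handler surfaces the text extracted from a PDF.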
controlled_summarization.py CHANGED

@@ -1,55 +1,59 @@
-from typing import List, Tuple
-import torch
-from SciAssist import Summarization
-
-device = "gpu" if torch.cuda.is_available() else "cpu"
-
-ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base")
-
-
-def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
-
-    if keywords is not None:
-        keywords = keywords.strip().split(",")
-        if keywords[0] == "":
-            keywords = None
-    if length==0 or length is None:
-        length = None
-    results = ctrlsum_pipeline.predict(input, type="str",
-                                       length=length, keywords=keywords)
-
-    output = []
-    for res in results["summary"]:
-        output.append(f"{res}\n\n")
-    return "".join(output)
-
-
-def ctrlsum_for_file(input, length=None, keywords=None) -> List[Tuple[str, str]]:
-    if input == None:
-        return None
-    filename = input.name
-    if keywords is not None:
-        keywords = keywords.strip().split(",")
-        if keywords[0] == "":
-            keywords = None
-    if length==0:
-        length = None
-    # Identify the format of input and parse reference strings
-    if filename[-4:] == ".txt":
-        results = ctrlsum_pipeline.predict(filename, type="txt",
-                                           save_results=False,
-                                           length=length, keywords=keywords)
-    elif filename[-4:] == ".pdf":
-        results = ctrlsum_pipeline.predict(filename,
-                                           save_results=False, length=length, keywords=keywords)
-    else:
-        return [("File Format Error !", None)]
-
-    output = []
-    for res in results["summary"]:
-        output.append(f"{res}\n\n")
-    return "".join(output)
-
-
-
+from typing import List, Tuple
+import torch
+from SciAssist import Summarization
+
+device = "gpu" if torch.cuda.is_available() else "cpu"
+
+ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base")
+
+
+def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
+
+    if keywords is not None:
+        keywords = keywords.strip().split(",")
+        if keywords[0] == "":
+            keywords = None
+    if length==0 or length is None:
+        length = None
+    results = ctrlsum_pipeline.predict(input, type="str",
+                                       length=length, keywords=keywords)
+
+    output = []
+    for res in results["summary"]:
+        output.append(f"{res}\n\n")
+    return "".join(output)
+
+
+def ctrlsum_for_file(input, length=None, keywords=None, text="") -> List[Tuple[str, str]]:
+    if input == None:
+        if text=="":
+            return None
+        else:
+            return ctrlsum_for_str(text,length,keywords),text
+    else:
+        filename = input.name
+        if keywords is not None:
+            keywords = keywords.strip().split(",")
+            if keywords[0] == "":
+                keywords = None
+        if length==0:
+            length = None
+        # Identify the format of input and parse reference strings
+        if filename[-4:] == ".txt":
+            results = ctrlsum_pipeline.predict(filename, type="txt",
+                                               save_results=False,
+                                               length=length, keywords=keywords)
+        elif filename[-4:] == ".pdf":
+            results = ctrlsum_pipeline.predict(filename,
+                                               save_results=False, length=length, keywords=keywords)
+        else:
+            return [("File Format Error !", None)]
+
+        output = []
+        for res in results["summary"]:
+            output.append(f"{res}\n\n")
+        return "".join(output), results["raw_text"]
+
+
+
 ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
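The reworked ctrlsum_for_file dispatches on its inputs: with no file and no pasted text it returns nothing; with pasted text only, it delegates to ctrlsum_for_str; with a file, it routes on the .txt/.pdf extension and returns the extracted raw text alongside the summary. Below is a small sketch of that control flow with the SciAssist pipeline stubbed out; parse_keywords, summarize, and dispatch are hypothetical names for illustration, not part of the library.

    # Sketch of the file-or-string dispatch in ctrlsum_for_file, with the
    # SciAssist pipeline replaced by a stub so the control flow can be run as-is.

    def parse_keywords(keywords):
        # "bert, nlp" -> ["bert", " nlp"]; an empty box means no keyword control.
        if keywords is None:
            return None
        parts = keywords.strip().split(",")
        return None if parts[0] == "" else parts

    def summarize(source, length, keywords):
        # Hypothetical placeholder for ctrlsum_pipeline.predict(...).
        return f"[summary of {source!r}, length={length}, keywords={keywords}]"

    def dispatch(file_name, length, keywords, text=""):
        keywords = parse_keywords(keywords)
        length = None if not length else length  # 0 or None -> no length control
        if file_name is None:
            if text == "":
                return None                      # nothing to summarize
            return summarize(text, length, keywords), text
        if file_name.endswith((".txt", ".pdf")):
            summary = summarize(file_name, length, keywords)
            return summary, f"[raw text extracted from {file_name}]"
        return "File Format Error !", None

    print(dispatch(None, 50, "BERT", "some pasted text"))
    print(dispatch("examples/BERT_paper.pdf", 0, ""))

The keyword handling mirrors the committed code: the keywords box is comma-split and an empty first element switches keyword control off, just as length 0 disables length control.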
description.py CHANGED

@@ -1,54 +1,51 @@
-# Reference string parsing Markdown
-rsp_str_md = '''
-To **test on strings**, simply input one or more strings.
-'''
-
-rsp_file_md = '''
-To **test on a file**, the input can be:
-
-- A txt file which contains a reference string in each line.
-
-- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
-
-'''
-# - A pdf file which contains a whole scientific document without any processing (including title, author...).
-
-ssum_str_md = '''
-To **test on strings**, simply input a string.
-
-'''
-
-ssum_file_md = '''
-To **test on a file**, the input can be:
-
-- A txt file which contains the content to be summarized.
-
-- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
-
-
-'''
-
-# - The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
-ctrlsum_str_md = '''
-To **test on strings**, simply input a string.
-
-**Note**:
-
-- Length 0 will exert no control over length.
-
-
-'''
-
-ctrlsum_file_md = '''
-To **test on a file**, the input can be:
-
-- A txt file which contains the content to be summarized.
-
-- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
-
-**Note**:
-
-- Length 0 will exert no control over length.
-
-
+# Reference string parsing Markdown
+rsp_str_md = '''
+To **test on strings**, simply input one or more strings.
+'''
+
+rsp_file_md = '''
+To **test on a file**, the input can be:
+
+- A txt file which contains a reference string in each line.
+
+- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+'''
+# - A pdf file which contains a whole scientific document without any processing (including title, author...).
+
+ssum_str_md = '''
+To **test on strings**, simply input a string.
+
+'''
+
+ssum_file_md = '''
+To **test on a file**, the input can be:
+
+- A txt file which contains the content to be summarized.
+
+- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+
+'''
+
+# - The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
+ctrlsum_str_md = '''
+To **test on strings**, simply input a string.
+
+**Note**:
+
+- Length 0 will exert no control over length.
+
+
+'''
+
+ctrlsum_file_md = '''
+To **test on a file**, the input can be:
+
+- A txt file which contains the content to be summarized.
+
+- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+
+
 '''
requirements.txt CHANGED

@@ -1,2 +1,2 @@
 torch==1.12.0
-SciAssist==0.0.27
+SciAssist==0.0.28