wing-nus dyxohjl666 committed on
Commit 086fdba • 1 Parent(s): e75148e

Add controlled summarization (#3)

- Add controlled summarization (b723b598f051adf56e95b16e90992eaee5dca0df)
- Delete unimportant files (387bd94d1e8d8c6d587955cb6bde7fdc6495b2f7)

Co-authored-by: Yixi Ding <dyxohjl666@users.noreply.huggingface.co>

README.md CHANGED
@@ -1,13 +1,13 @@
- ---
- title: Test Sciassist
- emoji: 🚀
- colorFrom: red
- colorTo: red
- sdk: gradio
- sdk_version: 3.4
- app_file: app.py
- pinned: false
- license: afl-3.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Test Sciassist
+ emoji: 🚀
+ colorFrom: red
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.4
+ app_file: app.py
+ pinned: false
+ license: afl-3.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,111 +1,161 @@
- import gradio as gr
- from description import *
-
- from reference_string_parsing import *
- from summarization import *
-
- with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
-     gr.Markdown("# Gradio Demo for SciAssist")
-     with gr.Tabs():
-         # Reference String Parsing
-         with gr.TabItem("Reference String Parsing"):
-             with gr.Box():
-                 gr.Markdown(rsp_str_md)
-                 with gr.Row():
-                     with gr.Column():
-                         rsp_str = gr.Textbox(label="Input String")
-                     with gr.Column():
-                         rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
-                 with gr.Row():
-                     rsp_str_btn = gr.Button("Parse")
-                 rsp_str_output = gr.HighlightedText(
-                     elem_id="htext",
-                     label="The Result of Parsing",
-                     combine_adjacent=True,
-                     adjacent_separator=" ",
-                 )
-                 rsp_str_examples = gr.Examples(examples=[[
-                     "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
-                     True],
-                     [
-                     "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
-                     False]], inputs=[rsp_str, rsp_str_dehyphen])
-             with gr.Box():
-                 gr.Markdown(rsp_file_md)
-                 with gr.Row():
-                     with gr.Column():
-                         rsp_file = gr.File(label="Input File")
-                         rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
-                 with gr.Row():
-                     rsp_file_btn = gr.Button("Parse")
-
-                 rsp_file_output = gr.HighlightedText(
-                     elem_id="htext",
-                     label="The Result of Parsing",
-                     combine_adjacent=True,
-                     adjacent_separator=" ",
-                 )
-                 rsp_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt", False],["examples/BERT_paper.pdf", True]], inputs=[rsp_file, rsp_file_dehyphen])
-
-
-             rsp_file_btn.click(
-                 fn=rsp_for_file,
-                 inputs=[rsp_file, rsp_file_dehyphen],
-                 outputs=rsp_file_output
-             )
-             rsp_str_btn.click(
-                 fn=rsp_for_str,
-                 inputs=[rsp_str, rsp_str_dehyphen],
-                 outputs=rsp_str_output
-             )
-
-         # Single Document Summarization
-         with gr.TabItem("Single Document Summarization"):
-             with gr.Box():
-                 gr.Markdown(ssum_str_md)
-                 with gr.Row():
-                     with gr.Column():
-                         ssum_str = gr.Textbox(label="Input String")
-                     with gr.Column():
-                         ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                         ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                 with gr.Row():
-                     ssum_str_btn = gr.Button("Generate")
-                 ssum_str_output = gr.Textbox(
-                     elem_id="htext",
-                     label="Summary",
-                 )
-                 ssum_str_examples = gr.Examples(examples=[[ssum_str_example, 1, 1], ],
-                                                 inputs=[ssum_str, ssum_str_beams, ssum_str_sequences])
-             with gr.Box():
-                 gr.Markdown(ssum_file_md)
-                 with gr.Row():
-                     with gr.Column():
-                         ssum_file = gr.File(label="Input File")
-                     with gr.Column():
-                         ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                         ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                 with gr.Row():
-                     ssum_file_btn = gr.Button("Generate")
-                 ssum_file_output = gr.Textbox(
-                     elem_id="htext",
-                     label="Summary",
-                 )
-                 ssum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 10, 2],["examples/BERT_paper.pdf", 1, 1]],
-                                                  inputs=[ssum_file, ssum_file_beams, ssum_file_sequences])
-
-             ssum_file_btn.click(
-                 fn=ssum_for_file,
-                 inputs=[ssum_file, ssum_file_beams, ssum_file_sequences],
-                 outputs=ssum_file_output
-             )
-             ssum_str_btn.click(
-                 fn=ssum_for_str,
-                 inputs=[ssum_str, ssum_str_beams, ssum_str_sequences],
-                 outputs=ssum_str_output
-             )
-
-
-
-
- demo.launch()
+ import gradio as gr
+ from description import *
+
+ from reference_string_parsing import *
+ from summarization import *
+ from controlled_summarization import *
+
+ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
+     gr.Markdown("# Gradio Demo for SciAssist")
+     with gr.Tabs():
+         # Reference String Parsing
+         with gr.TabItem("Reference String Parsing"):
+             with gr.Box():
+                 gr.Markdown(rsp_str_md)
+                 with gr.Row():
+                     with gr.Column():
+                         rsp_str = gr.Textbox(label="Input String")
+                     with gr.Column():
+                         rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
+                 with gr.Row():
+                     rsp_str_btn = gr.Button("Parse")
+                 rsp_str_output = gr.HighlightedText(
+                     elem_id="htext",
+                     label="The Result of Parsing",
+                     combine_adjacent=True,
+                     adjacent_separator=" ",
+                 )
+                 rsp_str_examples = gr.Examples(examples=[[
+                     "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
+                     True],
+                     [
+                     "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
+                     False]], inputs=[rsp_str, rsp_str_dehyphen])
+             with gr.Box():
+                 gr.Markdown(rsp_file_md)
+                 with gr.Row():
+                     with gr.Column():
+                         rsp_file = gr.File(label="Input File")
+                         rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
+                 with gr.Row():
+                     rsp_file_btn = gr.Button("Parse")
+
+                 rsp_file_output = gr.HighlightedText(
+                     elem_id="htext",
+                     label="The Result of Parsing",
+                     combine_adjacent=True,
+                     adjacent_separator=" ",
+                 )
+                 rsp_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt", False],["examples/BERT_paper.pdf", True]], inputs=[rsp_file, rsp_file_dehyphen])
+
+
+             rsp_file_btn.click(
+                 fn=rsp_for_file,
+                 inputs=[rsp_file, rsp_file_dehyphen],
+                 outputs=rsp_file_output
+             )
+             rsp_str_btn.click(
+                 fn=rsp_for_str,
+                 inputs=[rsp_str, rsp_str_dehyphen],
+                 outputs=rsp_str_output
+             )
+
+         # Single Document Summarization
+         with gr.TabItem("Summarization"):
+             with gr.Box():
+                 gr.Markdown(ssum_str_md)
+                 with gr.Row():
+                     with gr.Column():
+                         ssum_str = gr.Textbox(label="Input String")
+                     # with gr.Column():
+                     #     ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                     #     ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                 with gr.Row():
+                     ssum_str_btn = gr.Button("Generate")
+                 ssum_str_output = gr.Textbox(
+                     elem_id="htext",
+                     label="Summary",
+                 )
+                 ssum_str_examples = gr.Examples(examples=[[ssum_str_example], ],
+                                                 inputs=[ssum_str])
+             with gr.Box():
+                 gr.Markdown(ssum_file_md)
+                 with gr.Row():
+                     with gr.Column():
+                         ssum_file = gr.File(label="Input File")
+                     # with gr.Column():
+                     #     ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                     #     ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                 with gr.Row():
+                     ssum_file_btn = gr.Button("Generate")
+                 ssum_file_output = gr.Textbox(
+                     elem_id="htext",
+                     label="Summary",
+                 )
+                 ssum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt"],["examples/BERT_paper.pdf"]],
+                                                  inputs=[ssum_file])
+
+             ssum_file_btn.click(
+                 fn=ssum_for_file,
+                 inputs=[ssum_file],
+                 outputs=ssum_file_output
+             )
+             ssum_str_btn.click(
+                 fn=ssum_for_str,
+                 inputs=[ssum_str],
+                 outputs=ssum_str_output
+             )
+
+         # Controlled Summarization
+         with gr.TabItem("Controlled Summarization"):
+             with gr.Box():
+                 gr.Markdown(ctrlsum_str_md)
+                 with gr.Row():
+                     with gr.Column():
+                         ctrlsum_str = gr.Textbox(label="Input String")
+                     with gr.Column():
+                         # ctrlsum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                         # ctrlsum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                         ctrlsum_str_length = gr.Slider(0, 300, step=50, label="Length")
+                         ctrlsum_str_keywords = gr.Textbox(label="Keywords")
+                 with gr.Row():
+                     ctrlsum_str_btn = gr.Button("Generate")
+                 ctrlsum_str_output = gr.Textbox(
+                     elem_id="htext",
+                     label="Summary",
+                 )
+                 ctrlsum_str_examples = gr.Examples(examples=[[ssum_str_example, 50, "BERT" ], ],
+                                                    inputs=[ctrlsum_str, ctrlsum_str_length, ctrlsum_str_keywords])
+             with gr.Box():
+                 gr.Markdown(ctrlsum_file_md)
+                 with gr.Row():
+                     with gr.Column():
+                         ctrlsum_file = gr.File(label="Input File")
+                     with gr.Column():
+                         # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                         # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                         ctrlsum_file_length = gr.Slider(0,300,step=50, label="Length")
+                         ctrlsum_file_keywords = gr.Textbox(label="Keywords")
+                 with gr.Row():
+                     ctrlsum_file_btn = gr.Button("Generate")
+                 ctrlsum_file_output = gr.Textbox(
+                     elem_id="htext",
+                     label="Summary",
+                 )
+                 ctrlsum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 100, ""],["examples/BERT_paper.pdf", 0, "BERT"]],
+                                                     inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
+
+             ctrlsum_file_btn.click(
+                 fn=ctrlsum_for_file,
+                 inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords],
+                 outputs=ctrlsum_file_output
+             )
+             ctrlsum_str_btn.click(
+                 fn=ctrlsum_for_str,
+                 inputs=[ctrlsum_str, ctrlsum_str_length, ctrlsum_str_keywords],
+                 outputs=ctrlsum_str_output
+             )
+
+
+
+ demo.launch(share=True)
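
Every tab above follows the same Blocks pattern: lay out the input widgets and a button, then bind them to an output with Button.click(fn=..., inputs=..., outputs=...). The minimal sketch below shows that wiring in isolation; the stub handler and its names are invented for illustration and are not part of this commit.

```python
import gradio as gr

def stub_summarize(text: str) -> str:
    # Hypothetical stand-in for ssum_for_str / ctrlsum_for_str.
    return f"(summary of {len(text.split())} input tokens)"

with gr.Blocks() as sketch:
    inp = gr.Textbox(label="Input String")
    btn = gr.Button("Generate")
    out = gr.Textbox(label="Summary")
    # click() feeds the current values of `inputs` through `fn` and writes
    # the result into `outputs`; this is the only event wiring app.py uses.
    btn.click(fn=stub_summarize, inputs=[inp], outputs=out)

if __name__ == "__main__":
    sketch.launch()
```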
bart-large-cnn-e5.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4d4aab21eb3b88c4978c54a03214da478828b672d60bff3b0cf8fdfb646f4d66
- size 1625559041
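
The deleted lines are a Git LFS pointer (spec version, object hash, byte size) rather than the ~1.6 GB checkpoint itself. With this commit the Space stops shipping a local bart-large-cnn checkpoint and instead loads google/flan-t5-base through the SciAssist pipelines (see summarization.py and controlled_summarization.py below).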
 
 
 
 
controlled_summarization.py ADDED
@@ -0,0 +1,55 @@
+ from typing import List, Tuple
+ import torch
+ from SciAssist import Summarization
+
+ device = "gpu" if torch.cuda.is_available() else "cpu"
+
+ ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base")
+
+
+ def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
+
+     if keywords is not None:
+         keywords = keywords.strip().split(",")
+         if keywords[0] == "":
+             keywords = None
+     if length==0 or length is None:
+         length = None
+     results = ctrlsum_pipeline.predict(input, type="str",
+                                        length=length, keywords=keywords)
+
+     output = []
+     for res in results["summary"]:
+         output.append(f"{res}\n\n")
+     return "".join(output)
+
+
+ def ctrlsum_for_file(input, length=None, keywords=None) -> List[Tuple[str, str]]:
+     if input == None:
+         return None
+     filename = input.name
+     if keywords is not None:
+         keywords = keywords.strip().split(",")
+         if keywords[0] == "":
+             keywords = None
+     if length==0:
+         length = None
+     # Identify the format of input and parse reference strings
+     if filename[-4:] == ".txt":
+         results = ctrlsum_pipeline.predict(filename, type="txt",
+                                            save_results=False,
+                                            length=length, keywords=keywords)
+     elif filename[-4:] == ".pdf":
+         results = ctrlsum_pipeline.predict(filename,
+                                            save_results=False, length=length, keywords=keywords)
+     else:
+         return [("File Format Error !", None)]
+
+     output = []
+     for res in results["summary"]:
+         output.append(f"{res}\n\n")
+     return "".join(output)
+
+
+
+ ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : β€’ We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . 
( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . β€’ We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . β€’ BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
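
The guards at the top of ctrlsum_for_str mean that a length of 0 and an empty keyword box are both treated as "no constraint". A hypothetical smoke test, assuming SciAssist 0.0.24 is installed and the flan-t5-base checkpoint downloads successfully, might look like this; it is a sketch, not part of the commit:

```python
# Hypothetical smoke test for the new controlled-summarization entry points.
from controlled_summarization import ctrlsum_for_str, ctrlsum_str_example

# length=0 and keywords="" both fall through to None, i.e. uncontrolled.
print(ctrlsum_for_str(ctrlsum_str_example, length=0, keywords=""))

# A 50-token length target combined with a comma-separated keyword list.
print(ctrlsum_for_str(ctrlsum_str_example, length=50, keywords="BERT,pre-training"))
```

Note that both functions return a single joined string even though they are annotated List[Tuple[str, str]]; the annotation appears to be carried over from the reference-parsing helpers.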
description.py CHANGED
@@ -1,33 +1,54 @@
- # Reference string parsing Markdown
- rsp_str_md = '''
- To **test on strings**, simply input one or more strings.
- '''
-
- rsp_file_md = '''
- To **test on a file**, the input can be:
-
- - A txt file which contains a reference string in each line.
-
- - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
-
- '''
- # - A pdf file which contains a whole scientific document without any processing (including title, author...).
-
- ssum_str_md = '''
- To **test on strings**, simply input a string.
-
- **Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
-
- '''
-
- ssum_file_md = '''
- To **test on a file**, the input can be:
-
- - A txt file which contains the content to be summarized.
-
- - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
-
-
- **Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
-
- '''
+ # Reference string parsing Markdown
+ rsp_str_md = '''
+ To **test on strings**, simply input one or more strings.
+ '''
+
+ rsp_file_md = '''
+ To **test on a file**, the input can be:
+
+ - A txt file which contains a reference string in each line.
+
+ - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+ '''
+ # - A pdf file which contains a whole scientific document without any processing (including title, author...).
+
+ ssum_str_md = '''
+ To **test on strings**, simply input a string.
+
+ '''
+
+ ssum_file_md = '''
+ To **test on a file**, the input can be:
+
+ - A txt file which contains the content to be summarized.
+
+ - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+
+ '''
+
+ # - The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
+ ctrlsum_str_md = '''
+ To **test on strings**, simply input a string.
+
+ **Note**:
+
+ - Length 0 will exert no control over length.
+
+
+ '''
+
+ ctrlsum_file_md = '''
+ To **test on a file**, the input can be:
+
+ - A txt file which contains the content to be summarized.
+
+ - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+ **Note**:
+
+ - Length 0 will exert no control over length.
+
+
+ '''
examples/BERT - Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf ADDED
Binary file (775 kB).
reference_string_parsing.py CHANGED
@@ -1,36 +1,36 @@
- from typing import List, Tuple
- import torch
- from SciAssist import ReferenceStringParsing
-
- device = "gpu" if torch.cuda.is_available() else "cpu"
- rsp_pipeline = ReferenceStringParsing(os_name="nt")
-
-
- def rsp_for_str(input, dehyphen=False) -> List[Tuple[str, str]]:
-     results = rsp_pipeline.predict(input, type="str", dehyphen=dehyphen)
-     output = []
-     for res in results:
-         for token, tag in zip(res["tokens"], res["tags"]):
-             output.append((token, tag))
-         output.append(("\n\n", None))
-     return output
-
-
- def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
-     if input == None:
-         return None
-     filename = input.name
-     # Identify the format of input and parse reference strings
-     if filename[-4:] == ".txt":
-         results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen, save_results=False)
-     elif filename[-4:] == ".pdf":
-         results = rsp_pipeline.predict(filename, dehyphen=dehyphen, save_results=False)
-     else:
-         return [("File Format Error !", None)]
-     # Prepare for the input gradio.HighlightedText accepts.
-     output = []
-     for res in results:
-         for token, tag in zip(res["tokens"], res["tags"]):
-             output.append((token, tag))
-         output.append(("\n\n", None))
-     return output
+ from typing import List, Tuple
+ import torch
+ from SciAssist import ReferenceStringParsing
+
+ device = "gpu" if torch.cuda.is_available() else "cpu"
+ rsp_pipeline = ReferenceStringParsing(os_name="nt")
+
+
+ def rsp_for_str(input, dehyphen=False) -> List[Tuple[str, str]]:
+     results = rsp_pipeline.predict(input, type="str", dehyphen=dehyphen)
+     output = []
+     for res in results:
+         for token, tag in zip(res["tokens"], res["tags"]):
+             output.append((token, tag))
+         output.append(("\n\n", None))
+     return output
+
+
+ def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
+     if input == None:
+         return None
+     filename = input.name
+     # Identify the format of input and parse reference strings
+     if filename[-4:] == ".txt":
+         results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen, save_results=False)
+     elif filename[-4:] == ".pdf":
+         results = rsp_pipeline.predict(filename, dehyphen=dehyphen, save_results=False)
+     else:
+         return [("File Format Error !", None)]
+     # Prepare for the input gradio.HighlightedText accepts.
+     output = []
+     for res in results:
+         for token, tag in zip(res["tokens"], res["tags"]):
+             output.append((token, tag))
+         output.append(("\n\n", None))
+     return output
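
For context, the (token, tag) pairs built here are exactly the shape gr.HighlightedText consumes. The literal below illustrates that shape; the tag names are invented for the example, since the real label set comes from the SciAssist model.

```python
# Illustration only: the structure rsp_for_str returns for gr.HighlightedText.
parsed = [
    ("Waleed", "author"),
    ("Ammar", "author"),
    ("2017.", "date"),
    ("\n\n", None),  # untagged separator emitted after each parsed reference
]
# With combine_adjacent=True, consecutive spans sharing a tag are merged,
# joined by adjacent_separator=" ".
```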
requirements.txt CHANGED
@@ -1,2 +1,2 @@
- torch==1.12.0
- SciAssist==0.0.22
+ torch==1.12.0
+ SciAssist==0.0.24
summarization.py CHANGED
@@ -1,37 +1,37 @@
- from typing import List, Tuple
- import torch
- from SciAssist import Summarization
-
- device = "gpu" if torch.cuda.is_available() else "cpu"
- ssum_pipeline = Summarization(os_name="nt")
-
-
- def ssum_for_str(input, num_beams=1, num_return_sequences=1) -> List[Tuple[str, str]]:
-     results = ssum_pipeline.predict(input, type="str", num_beams=num_beams, num_return_sequences=num_return_sequences)
-
-     output = []
-     for res in results["summary"]:
-         output.append(f"{res}\n\n")
-     return "".join(output)
-
-
- def ssum_for_file(input, num_beams=1, num_return_sequences=1) -> List[Tuple[str, str]]:
-     if input == None:
-         return None
-     filename = input.name
-     # Identify the format of input and parse reference strings
-     if filename[-4:] == ".txt":
-         results = ssum_pipeline.predict(filename, type="txt", num_beams=num_beams,
-                                         num_return_sequences=num_return_sequences, save_results=False)
-     elif filename[-4:] == ".pdf":
-         results = ssum_pipeline.predict(filename, num_beams=num_beams, num_return_sequences=num_return_sequences, save_results=False)
-     else:
-         return [("File Format Error !", None)]
-
-     output = []
-     for res in results["summary"]:
-         output.append(f"{res}\n\n")
-     return "".join(output)
-
-
+ from typing import List, Tuple
+ import torch
+ from SciAssist import Summarization
+
+ device = "gpu" if torch.cuda.is_available() else "cpu"
+ ssum_pipeline = Summarization(os_name="nt", checkpoint="google/flan-t5-base")
+
+
+ def ssum_for_str(input) -> List[Tuple[str, str]]:
+     results = ssum_pipeline.predict(input, type="str")
+
+     output = []
+     for res in results["summary"]:
+         output.append(f"{res}\n\n")
+     return "".join(output)
+
+
+ def ssum_for_file(input) -> List[Tuple[str, str]]:
+     if input == None:
+         return None
+     filename = input.name
+     # Identify the format of input and parse reference strings
+     if filename[-4:] == ".txt":
+         results = ssum_pipeline.predict(filename, type="txt",
+                                         save_results=False)
+     elif filename[-4:] == ".pdf":
+         results = ssum_pipeline.predict(filename, save_results=False)
+     else:
+         return [("File Format Error !", None)]
+
+     output = []
+     for res in results["summary"]:
+         output.append(f"{res}\n\n")
+     return "".join(output)
+
+
  ssum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : β€’ We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . 
( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . β€’ We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . β€’ BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
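
Both this file and controlled_summarization.py dispatch on filename[-4:]. A slightly more defensive spelling of the same behavior, sketched here as an illustration rather than part of the commit, would use os.path.splitext so that upper-case and longer extensions are handled uniformly:

```python
import os

def detect_input_type(filename: str) -> str:
    # Equivalent to the filename[-4:] checks above, but case-insensitive.
    ext = os.path.splitext(filename)[1].lower()
    if ext in (".txt", ".pdf"):
        return ext[1:]
    raise ValueError(f"File Format Error: unsupported extension {ext!r}")

assert detect_input_type("examples/BERT_body.txt") == "txt"
assert detect_input_type("examples/BERT_paper.PDF") == "pdf"
```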