wing-nus committed
Commit 420f561
1 Parent(s): 6a19e45

add ref parser and summarizer


reference string parsing and summarization demos

Files changed (5)
  1. app.py +111 -0
  2. description.py +30 -0
  3. reference_string_parsing.py +36 -0
  4. requirements.txt +2 -0
  5. summarization.py +37 -0
app.py ADDED
@@ -0,0 +1,111 @@
+import gradio as gr
+from description import *
+
+from reference_string_parsing import *
+from summarization import *
+
+with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
+    gr.Markdown("# Gradio Demo for SciAssist")
+    with gr.Tabs():
+        # Reference String Parsing
+        with gr.TabItem("Reference String Parsing"):
+            with gr.Box():
+                gr.Markdown(rsp_str_md)
+                with gr.Row():
+                    with gr.Column():
+                        rsp_str = gr.Textbox(label="Input String")
+                    with gr.Column():
+                        rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
+                with gr.Row():
+                    rsp_str_btn = gr.Button("Parse")
+                rsp_str_output = gr.HighlightedText(
+                    elem_id="htext",
+                    label="The Result of Parsing",
+                    combine_adjacent=True,
+                    adjacent_separator=" ",
+                )
+                rsp_str_examples = gr.Examples(examples=[[
+                    "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
+                    True],
+                    [
+                    "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
+                    False]], inputs=[rsp_str, rsp_str_dehyphen])
+            with gr.Box():
+                gr.Markdown(rsp_file_md)
+                with gr.Row():
+                    with gr.Column():
+                        rsp_file = gr.File(label="Input File")
+                        rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
+                with gr.Row():
+                    rsp_file_btn = gr.Button("Parse")
+
+                rsp_file_output = gr.HighlightedText(
+                    elem_id="htext",
+                    label="The Result of Parsing",
+                    combine_adjacent=True,
+                    adjacent_separator=" ",
+                )
+                rsp_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt", False],], inputs=[rsp_file, rsp_file_dehyphen])
+
+
+            rsp_file_btn.click(
+                fn=rsp_for_file,
+                inputs=[rsp_file, rsp_file_dehyphen],
+                outputs=rsp_file_output
+            )
+            rsp_str_btn.click(
+                fn=rsp_for_str,
+                inputs=[rsp_str, rsp_str_dehyphen],
+                outputs=rsp_str_output
+            )
+
+        # Single Document Summarization
+        with gr.TabItem("Single Document Summarization"):
+            with gr.Box():
+                gr.Markdown(ssum_str_md)
+                with gr.Row():
+                    with gr.Column():
+                        ssum_str = gr.Textbox(label="Input String")
+                    with gr.Column():
+                        ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                        ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                with gr.Row():
+                    ssum_str_btn = gr.Button("Generate")
+                ssum_str_output = gr.Textbox(
+                    elem_id="htext",
+                    label="Summary",
+                )
+                ssum_str_examples = gr.Examples(examples=[[ssum_str_example, 1, 1], ],
+                                                inputs=[ssum_str, ssum_str_beams, ssum_str_sequences])
+            with gr.Box():
+                gr.Markdown(ssum_file_md)
+                with gr.Row():
+                    with gr.Column():
+                        ssum_file = gr.File(label="Input File")
+                    with gr.Column():
+                        ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                        ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                with gr.Row():
+                    ssum_file_btn = gr.Button("Generate")
+                ssum_file_output = gr.Textbox(
+                    elem_id="htext",
+                    label="Summary",
+                )
+                ssum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 10, 2],],
+                                                 inputs=[ssum_file, ssum_file_beams, ssum_file_sequences])
+
+            ssum_file_btn.click(
+                fn=ssum_for_file,
+                inputs=[ssum_file, ssum_file_beams, ssum_file_sequences],
+                outputs=ssum_file_output
+            )
+            ssum_str_btn.click(
+                fn=ssum_for_str,
+                inputs=[ssum_str, ssum_str_beams, ssum_str_sequences],
+                outputs=ssum_str_output
+            )
+
+
+
+
+demo.launch()
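
For readers skimming the diff, here is a minimal standalone sketch of the Button → function → component wiring used throughout app.py. `echo` is a hypothetical stand-in for `rsp_for_str` / `ssum_for_str` and is not part of this commit:

```python
import gradio as gr

# Hypothetical stand-in for rsp_for_str / ssum_for_str; illustrates the
# click-wiring pattern only, not the SciAssist pipelines.
def echo(text, uppercase=False):
    return text.upper() if uppercase else text

with gr.Blocks() as sketch:
    inp = gr.Textbox(label="Input String")
    flag = gr.Checkbox(label="uppercase")
    btn = gr.Button("Run")
    out = gr.Textbox(label="Output")
    # Same pattern as rsp_str_btn.click(...) and ssum_str_btn.click(...) above.
    btn.click(fn=echo, inputs=[inp, flag], outputs=out)

if __name__ == "__main__":
    sketch.launch()
```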
description.py ADDED
@@ -0,0 +1,30 @@
+# Reference string parsing Markdown
+rsp_str_md = '''
+To **test on strings**, simply input one or more strings.
+'''
+
+rsp_file_md = '''
+To **test on a file**, the input can be:
+
+- A .txt file which contains one reference string per line.
+
+
+'''
+# - A pdf file which contains a whole scientific document without any processing (including title, author...).
+
+ssum_str_md = '''
+To **test on strings**, simply input a string.
+
+**Note**: For group beam search, the **number of beams** must be **divisible** by the **number of generated summaries**.
+
+'''
+
+ssum_file_md = '''
+To **test on a file**, the input can be:
+
+- A .txt file which contains the content to be summarized.
+
+**Note**: For group beam search, the **number of beams** must be **divisible** by the **number of generated summaries**.
+
+'''
+# - A pdf file which contains a whole scientific document without any processing (including title, author...).
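
The divisibility note in `ssum_str_md` and `ssum_file_md` matches the constraint Hugging Face transformers enforces for group (diverse) beam search. The sketch below assumes SciAssist forwards these arguments to `generate` with `num_beam_groups` set to the number of returned summaries, which this commit does not show:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Illustration only: assumes the summarizer maps "number of generated
# summaries" to num_beam_groups / num_return_sequences (not shown in this
# commit). transformers requires num_beams % num_beam_groups == 0.
tok = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")
inputs = tok("Language model pre-training has been shown to be effective.",
             return_tensors="pt")
out = model.generate(
    **inputs,
    num_beams=4,             # divisible by num_beam_groups
    num_beam_groups=2,
    num_return_sequences=2,  # at most num_beams
    diversity_penalty=0.5,   # makes the beam groups actually differ
    max_new_tokens=32,
)
print(tok.batch_decode(out, skip_special_tokens=True))
```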
reference_string_parsing.py ADDED
@@ -0,0 +1,36 @@
+from typing import List, Tuple
+import torch
+from SciAssist import ReferenceStringParsing
+
+device = "cuda" if torch.cuda.is_available() else "cpu"  # torch device string; not passed to the pipeline below
+rsp_pipeline = ReferenceStringParsing()
+
+
+def rsp_for_str(input, dehyphen=False) -> List[Tuple[str, str]]:
+    results = rsp_pipeline.predict(input, type="str", dehyphen=dehyphen)
+    output = []
+    for res in results:
+        for token, tag in zip(res["tokens"], res["tags"]):
+            output.append((token, tag))
+        output.append(("\n\n", None))
+    return output
+
+
+def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
+    if input is None:
+        return None
+    filename = input.name
+    # Identify the format of the input and parse the reference strings
+    if filename.endswith(".txt"):
+        results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen)
+    # elif filename.endswith(".pdf"):
+    #     results = rsp_pipeline.predict(filename, dehyphen=dehyphen)
+    else:
+        return [("File Format Error!", None)]
+    # Convert to the (token, tag) pairs gradio.HighlightedText accepts.
+    output = []
+    for res in results:
+        for token, tag in zip(res["tokens"], res["tags"]):
+            output.append((token, tag))
+        output.append(("\n\n", None))
+    return output
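
A quick way to exercise the parser outside Gradio (a sketch; the input is the first example wired into app.py, and the tag set itself comes from SciAssist):

```python
from reference_string_parsing import rsp_for_str

# rsp_for_str returns (token, tag) pairs in the format gr.HighlightedText expects.
pairs = rsp_for_str(
    "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell "
    "Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): "
    "semi-supervised end-to-end entity and relation extraction. "
    "In ACL workshop (SemEval).",
    dehyphen=True,
)
for token, tag in pairs:
    print(f"{token!r:<40} {tag}")
```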
requirements.txt ADDED
@@ -0,0 +1,2 @@
+torch==1.12.0
+SciAssist==0.0.18
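
A quick sanity check that the pins resolved after `pip install -r requirements.txt` (a convenience sketch, not part of the commit):

```python
from importlib.metadata import version

# Expected per requirements.txt: torch 1.12.0, SciAssist 0.0.18.
print("torch", version("torch"))
print("SciAssist", version("SciAssist"))
```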
summarization.py ADDED
@@ -0,0 +1,37 @@
+from typing import List, Tuple
+import torch
+from SciAssist import Summarization
+
+device = "cuda" if torch.cuda.is_available() else "cpu"  # torch device string; not passed to the pipeline below
+ssum_pipeline = Summarization()
+
+
+def ssum_for_str(input, num_beams=1, num_return_sequences=1) -> str:
+    results = ssum_pipeline.predict(input, type="str", num_beams=num_beams, num_return_sequences=num_return_sequences)
+
+    output = []
+    for res in results["summary"]:
+        output.append(f"{res}\n\n")
+    return "".join(output)
+
+
+def ssum_for_file(input, num_beams=1, num_return_sequences=1) -> str:
+    if input is None:
+        return None
+    filename = input.name
+    # Identify the format of the input and summarize its content
+    if filename.endswith(".txt"):
+        results = ssum_pipeline.predict(filename, type="txt", num_beams=num_beams,
+                                        num_return_sequences=num_return_sequences, save_results=False)
+    # elif filename.endswith(".pdf"):
+    #     results = ssum_pipeline.predict(filename, num_beams=num_beams, num_return_sequences=num_return_sequences)
+    else:
+        return "File Format Error!"
+
+    output = []
+    for res in results["summary"]:
+        output.append(f"{res}\n\n")
+    return "".join(output)
+
+
+ssum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
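
And a minimal way to run the summarizer on the bundled example text outside Gradio (a sketch; per the note in description.py, pick a beam count divisible by the number of returned summaries):

```python
from summarization import ssum_for_str, ssum_str_example

# Two summaries from a 4-beam search (4 is divisible by 2).
print(ssum_for_str(ssum_str_example, num_beams=4, num_return_sequences=2))
```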