kirinzhu committed
Commit 831abcc
1 Parent(s): 1a8f8aa

Upload 5 files

Files changed (4)
  1. app.py +37 -48
  2. controlled_summarization.py +59 -0
  3. dataset_extraction.py +1 -1
  4. description.py +23 -3
app.py CHANGED
@@ -2,12 +2,46 @@ import gradio as gr
 from description import *
 
 from reference_string_parsing import *
-from summarization import *
-from dataset_extraction import *
+from controlled_summarization import *
 
 with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
     gr.Markdown("# Gradio Demo for SciAssist")
     with gr.Tabs():
+
+        # Controlled Summarization
+        with gr.TabItem("Summarization"):
+
+            with gr.Box():
+                gr.Markdown(ctrlsum_file_md)
+                with gr.Row():
+                    with gr.Column():
+                        ctrlsum_file = gr.File(label="Input File")
+                        ctrlsum_str = gr.TextArea(label="Input String")
+                    with gr.Column():
+                        gr.Markdown("* Length 0 will exert no control over length.")
+                        # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
+                        # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
+                        ctrlsum_file_length = gr.Slider(0,300,step=50, label="Length")
+                        ctrlsum_file_keywords = gr.Textbox(label="Keywords",max_lines=1)
+                with gr.Row():
+                    ctrlsum_file_btn = gr.Button("Generate")
+                    ctrlsum_file_output = gr.Textbox(
+                        elem_id="htext",
+                        label="Summary",
+                    )
+                ctrlsum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 100, "", ""],["examples/BERT_paper.pdf", 0, "BERT"]],
+                                                    inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords])
+
+            ctrlsum_file_btn.click(
+                fn=ctrlsum_for_file,
+                inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str],
+                outputs=[ctrlsum_file_output, ctrlsum_str]
+            )
+            def clear():
+                return None,0,None
+
+            ctrlsum_file.change(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords])
+
         # Reference String Parsing
         with gr.TabItem("Reference String Parsing"):
             with gr.Box():
@@ -60,51 +94,6 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
             outputs=rsp_str_output
         )
 
-        # Single Document Summarization
-        with gr.TabItem("Single Document Summarization"):
-            with gr.Box():
-                gr.Markdown(ssum_str_md)
-                with gr.Row():
-                    with gr.Column():
-                        ssum_str = gr.Textbox(label="Input String")
-                    with gr.Column():
-                        ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                        ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                with gr.Row():
-                    ssum_str_btn = gr.Button("Generate")
-                    ssum_str_output = gr.Textbox(
-                        elem_id="htext",
-                        label="Summary",
-                    )
-                ssum_str_examples = gr.Examples(examples=[[ssum_str_example, 1, 1], ],
-                                                inputs=[ssum_str, ssum_str_beams, ssum_str_sequences])
-            with gr.Box():
-                gr.Markdown(ssum_file_md)
-                with gr.Row():
-                    with gr.Column():
-                        ssum_file = gr.File(label="Input File")
-                    with gr.Column():
-                        ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
-                        ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
-                with gr.Row():
-                    ssum_file_btn = gr.Button("Generate")
-                    ssum_file_output = gr.Textbox(
-                        elem_id="htext",
-                        label="Summary",
-                    )
-                ssum_file_examples = gr.Examples(examples=[["examples/BERT_body.txt", 10, 2],["examples/BERT_paper.pdf", 1, 1]],
-                                                 inputs=[ssum_file, ssum_file_beams, ssum_file_sequences])
-
-            ssum_file_btn.click(
-                fn=ssum_for_file,
-                inputs=[ssum_file, ssum_file_beams, ssum_file_sequences],
-                outputs=ssum_file_output
-            )
-            ssum_str_btn.click(
-                fn=ssum_for_str,
-                inputs=[ssum_str, ssum_str_beams, ssum_str_sequences],
-                outputs=ssum_str_output
-            )
 
         # Dataset Extraction
         with gr.TabItem("Dataset Mentions Extraction"):
@@ -153,4 +142,4 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
         )
 
 
-demo.launch()
+demo.launch(share=False)
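
The new tab's wiring has one subtlety worth noting: ctrlsum_for_file returns both the summary and the extracted raw text, so uploading a file repopulates the Input String box, while changing the file resets the dependent controls via clear(). Below is a minimal, self-contained sketch of the same pattern, assuming Gradio 3.x; summarize_stub is a hypothetical stand-in for SciAssist's pipeline, not part of the commit:

import gradio as gr

def summarize_stub(file, length, keywords, text):
    # Mirrors ctrlsum_for_file's contract: return (summary, raw_text) so the
    # second output repopulates the TextArea after a file upload.
    source = text if file is None else f"<contents of {file.name}>"
    return f"summary(length={length}, keywords={keywords!r})", source

with gr.Blocks() as sketch:
    file_in = gr.File(label="Input File")
    text_in = gr.TextArea(label="Input String")
    length_in = gr.Slider(0, 300, step=50, label="Length")
    kw_in = gr.Textbox(label="Keywords", max_lines=1)
    btn = gr.Button("Generate")
    out = gr.Textbox(label="Summary")

    btn.click(fn=summarize_stub,
              inputs=[file_in, length_in, kw_in, text_in],
              outputs=[out, text_in])

    # As in the commit, selecting a new file clears the text, length, and keywords.
    def clear():
        return None, 0, None

    file_in.change(clear, inputs=None, outputs=[text_in, length_in, kw_in])

sketch.launch()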
controlled_summarization.py ADDED
@@ -0,0 +1,59 @@
+from typing import List, Tuple
+import torch
+from SciAssist import Summarization
+
+device = "gpu" if torch.cuda.is_available() else "cpu"
+
+ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base")
+
+
+def ctrlsum_for_str(input, length=None, keywords=None) -> List[Tuple[str, str]]:
+
+    if keywords is not None:
+        keywords = keywords.strip().split(",")
+        if keywords[0] == "":
+            keywords = None
+    if length==0 or length is None:
+        length = None
+    results = ctrlsum_pipeline.predict(input, type="str",
+                                       length=length, keywords=keywords)
+
+    output = []
+    for res in results["summary"]:
+        output.append(f"{res}\n\n")
+    return "".join(output)
+
+
+def ctrlsum_for_file(input, length=None, keywords=None, text="") -> List[Tuple[str, str]]:
+    if input == None:
+        if text=="":
+            return None
+        else:
+            return ctrlsum_for_str(text,length,keywords),text
+    else:
+        filename = input.name
+        if keywords is not None:
+            keywords = keywords.strip().split(",")
+            if keywords[0] == "":
+                keywords = None
+        if length==0:
+            length = None
+        # Identify the format of input and parse reference strings
+        if filename[-4:] == ".txt":
+            results = ctrlsum_pipeline.predict(filename, type="txt",
+                                               save_results=False,
+                                               length=length, keywords=keywords)
+        elif filename[-4:] == ".pdf":
+            results = ctrlsum_pipeline.predict(filename,
+                                               save_results=False, length=length, keywords=keywords)
+        else:
+            return [("File Format Error !", None)]
+
+        output = []
+        for res in results["summary"]:
+            output.append(f"{res}\n\n")
+        return "".join(output), results["raw_text"]
+
+
+
+ ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . 
( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
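
A hypothetical smoke test for the module above, assuming SciAssist and the checkpoint are available locally; these calls are illustrative, not part of the commit:

from controlled_summarization import ctrlsum_for_str, ctrlsum_str_example

# Uncontrolled summary: length=None and keywords=None leave the defaults.
print(ctrlsum_for_str(ctrlsum_str_example))

# Length control: 0 (the slider default) is normalized to None, i.e. no
# control, so pass a positive target such as 100.
print(ctrlsum_for_str(ctrlsum_str_example, length=100))

# Keyword control: keywords arrive as a single comma-separated string.
print(ctrlsum_for_str(ctrlsum_str_example, keywords="BERT,pre-training"))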
dataset_extraction.py CHANGED
@@ -6,7 +6,7 @@ device = "gpu" if torch.cuda.is_available() else "cpu"
 de_pipeline = DatasetExtraction(os_name="nt")
 
 
-def de_for_str(input) -> List[Tuple[str, str]]:
+def de_for_str(input):
     results = de_pipeline.extract(input, type="str", save_results=False)
 
     output = []
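
The dropped return annotation suggests de_for_str no longer returns a List[Tuple[str, str]]; the visible body builds an output list, presumably joined into a display string as in the summarization module. A hypothetical call, assuming SciAssist's DatasetExtraction pipeline is installed:

from dataset_extraction import de_for_str

# Extract dataset mentions from a raw string; the pipeline is invoked with
# save_results=False, so nothing is written to disk.
print(de_for_str("We evaluate our approach on SQuAD 1.1 and the GLUE benchmark ."))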
description.py CHANGED
@@ -16,8 +16,6 @@ To **test on a file**, the input can be:
 ssum_str_md = '''
 To **test on strings**, simply input a string.
 
-**Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
-
 '''
 
 ssum_file_md = '''
@@ -28,10 +26,32 @@ To **test on a file**, the input can be:
 - A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
 
 
-**Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
+'''
+
+# - The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
+ctrlsum_str_md = '''
+To **test on strings**, simply input a string.
+
+**Note**:
+
+- Length 0 will exert no control over length.
+
+
+'''
+
+ctrlsum_file_md = '''
+To **test on a file**, the input can be:
+
+- A txt file which contains the content to be summarized.
+
+- A pdf file which contains a whole scientific documention without any preprocessing(including title, author, body text...).
+
+
 
 '''
 
+
+
 de_str_md = '''
 To **test on strings**, simply input a string.
 '''