kirinzhu committed
Commit e71e3dd · 1 Parent(s): 8bd5464

Upload 3 files

Files changed (3)
  1. app.py +44 -0
  2. dataset_extraction.py +36 -0
  3. description.py +13 -0
app.py CHANGED
@@ -105,7 +105,51 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
            outputs=ssum_str_output
        )

+   # Dataset Extraction
+   with gr.TabItem("Dataset Mentions Extraction"):
+       with gr.Box():
+           gr.Markdown(de_str_md)
+           with gr.Row():
+               with gr.Column():
+                   de_str = gr.Textbox(label="Input String")
+                   with gr.Row():
+                       de_str_btn = gr.Button("Extract")
+               de_str_output = gr.HighlightedText(
+                   elem_id="htext",
+                   label="The Result of Extraction",
+                   combine_adjacent=True,
+                   adjacent_separator=" ",
+               )
+           de_str_examples = gr.Examples(
+               examples=[["Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval)."],
+                         ["Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval)."]],
+               inputs=[de_str])
+       with gr.Box():
+           gr.Markdown(de_file_md)
+           with gr.Row():
+               with gr.Column():
+                   de_file = gr.File(label="Input File")
+                   with gr.Row():
+                       de_file_btn = gr.Button("Extract")
+               de_file_output = gr.HighlightedText(
+                   elem_id="htext",
+                   label="The Result of Extraction",
+                   combine_adjacent=True,
+                   adjacent_separator=" ",
+               )
+           de_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt"], ["examples/BERT_paper.pdf"]], inputs=[de_file])
+
+       de_file_btn.click(
+           fn=de_for_file,
+           inputs=[de_file],
+           outputs=de_file_output
+       )
+       de_str_btn.click(
+           fn=de_for_str,
+           inputs=[de_str],
+           outputs=de_str_output
+       )


demo.launch()
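Reviewer note: the new tab just binds the two "Extract" buttons to `de_for_str` and `de_for_file` from `dataset_extraction.py` and renders their return values in a `HighlightedText` component. A rough, self-contained sketch of that wiring pattern is below; `fake_extract` is a placeholder stub, not the SciAssist pipeline, and the "dataset" labels it emits are made up purely for illustration.

```python
import gradio as gr

def fake_extract(text):
    # Stand-in for the real extractor: tag capitalised tokens as "dataset" mentions.
    return [(tok, "dataset" if tok[:1].isupper() else None) for tok in text.split()]

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input String")
    btn = gr.Button("Extract")
    out = gr.HighlightedText(label="The Result of Extraction",
                             combine_adjacent=True, adjacent_separator=" ")
    # Same pattern as the diff: button click feeds the textbox into the
    # extraction function and shows the result in the HighlightedText output.
    btn.click(fn=fake_extract, inputs=[inp], outputs=out)

demo.launch()
```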
dataset_extraction.py ADDED
@@ -0,0 +1,36 @@
+ from typing import List, Tuple
+ import torch
+ from SciAssist import DatasetExtraction
+
+ device = "gpu" if torch.cuda.is_available() else "cpu"
+ de_pipeline = DatasetExtraction(os_name="nt")
+
+
+ def de_for_str(input: str) -> str:
+     # Extract dataset mentions from a raw string and join them, separated by blank lines.
+     results = de_pipeline.extract(input, type="str", save_results=False)
+
+     output = []
+     for res in results["dataset_mentions"]:
+         output.append(f"{res}\n\n")
+     return "".join(output)
+
+
+ def de_for_file(input):
+     if input is None:
+         return None
+     filename = input.name
+     # Identify the input format and extract dataset mentions accordingly
+     if filename.endswith(".txt"):
+         results = de_pipeline.extract(filename, type="txt", save_results=False)
+     elif filename.endswith(".pdf"):
+         results = de_pipeline.extract(filename, type="pdf", save_results=False)
+     else:
+         return [("File Format Error!", None)]
+
+     output = []
+     for res in results["dataset_mentions"]:
+         output.append(f"{res}\n\n")
+     return "".join(output)
+
+
+ de_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
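For exercising the new module outside the Gradio UI, a hypothetical local smoke test could look like the sketch below. It assumes SciAssist is installed and its models can be downloaded; `SimpleNamespace` merely stands in for Gradio's uploaded-file wrapper (only the `.name` attribute is read), and the example path is the one already referenced in `app.py`.

```python
from types import SimpleNamespace

from dataset_extraction import de_for_file, de_for_str, de_str_example

# de_for_str takes a raw string and returns the extracted mentions
# joined into one display string, separated by blank lines.
print(de_for_str(de_str_example))

# de_for_file expects an object with a .name attribute (Gradio's upload
# wrapper); SimpleNamespace stands in for it here.
print(de_for_file(SimpleNamespace(name="examples/BERT_paper.pdf")))
```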
description.py CHANGED
@@ -31,3 +31,16 @@ To **test on a file**, the input can be:
  **Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.

  '''
+
+ de_str_md = '''
+ To **test on strings**, simply input a string.
+ '''
+
+ de_file_md = '''
+ To **test on a file**, the input can be:
+
+ - A txt file containing the text from which dataset mentions are to be extracted.
+
+ - A pdf file containing a whole scientific document without any preprocessing (including title, authors, body text, ...).
+
+ '''