Spaces:
Sleeping
Sleeping
File size: 9,091 Bytes
3b85924 7236411 44257a9 3b85924 44257a9 3b85924 7236411 44257a9 3b85924 44257a9 3b85924 44257a9 3b85924 7236411 44257a9 3b85924 44257a9 3b85924 44257a9 7236411 44257a9 7236411 44257a9 7236411 3b85924 44257a9 3b85924 44257a9 3b85924 44257a9 3b85924 7236411 3b85924 7236411 3b85924 44257a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
from typing import List, Tuple
import torch
from SciAssist import Summarization
import os
import requests
from datasets import load_dataset
# Pre-computed summaries dataset, read from the hub's "refs/convert/parquet" revision.
# The returned object is iterated by key further down to fill the caches.
acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")
# Device label handed to the pipeline: "gpu" when CUDA is available, otherwise "cpu".
device = "gpu" if torch.cuda.is_available() else "cpu"
# Single shared summarization pipeline used by the ctrlsum_* functions.
# NOTE(review): os_name is hard-coded to "nt" — confirm this is intended on non-Windows hosts.
ctrlsum_pipeline = Summarization(os_name="nt",device=device)
# Summary cache: {url: {length: {keywords: summary}, "raw_text": str}}.
acl_dict = {}
# Keyword index: {url: [keyword strings seen for that url]}.
recommended_kw = {}
def convert_to_dict(data, summaries=None, keyword_index=None):
    """
    Index one dataset split into the summary cache and the keyword index.

    Cache layout:
        { url:
            { length:
                { keywords: summary };
              "raw_text":
                str;
            }
        }

    Parameters:
    data: Mapping with parallel "url", "text", "keywords", "length" and
        "summary" columns (one entry per row).
    summaries (dict): Cache to fill. Defaults to the module-level acl_dict.
    keyword_index (dict): url -> list of keywords to fill. Defaults to the
        module-level recommended_kw.

    Returns:
    int: Always 1.
    """
    # Compare against None explicitly: an empty dict is a valid target.
    if summaries is None:
        summaries = acl_dict
    if keyword_index is None:
        keyword_index = recommended_kw

    rows = zip(data["url"], data["text"], data["keywords"],
               data["length"], data["summary"])
    for url, text, kw, length, summary in rows:
        # Missing or very short identifiers cannot name a paper; skip them.
        if not url or len(url) < 5:
            continue
        url = url + ".pdf"
        kw = "" if kw is None else str(kw).strip()
        length = "" if length is None else str(length).strip()

        entry = summaries.get(url)
        if entry is None:
            entry = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": text}
            summaries[url] = entry
        # Store the row even when it is the first one seen for this url
        # (previously that row's summary was dropped), and create the length
        # bucket on demand so an unexpected length cannot raise KeyError.
        # setdefault keeps the first summary recorded for a (length, keywords)
        # pair, as before.
        entry.setdefault(length, {}).setdefault(kw, summary)

        # Record each distinct, non-empty keyword string once per url.
        known_keywords = keyword_index.setdefault(url, [])
        if kw and kw not in known_keywords:
            known_keywords.append(kw)
    return 1
# Feed every entry of the loaded dataset through convert_to_dict so the
# module-level caches are populated once, at import time.
for i in acl_data:
    signal = convert_to_dict(acl_data[i])
def download_pdf(url, dest_folder, timeout=60):
    """
    Download a PDF from a given URL and save it to a specified destination folder.

    The file is named after the last path segment of the URL.

    Parameters:
    url (str): URL of the PDF
    dest_folder (str): Destination folder to save the downloaded PDF
    timeout (float): Seconds to wait for the server before giving up

    Returns:
    str: Path of the saved file

    Raises:
    requests.HTTPError: If the server answers with an error status.
    """
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(dest_folder, exist_ok=True)
    filename = os.path.join(dest_folder, url.split("/")[-1])
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as a ".pdf".
        response.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
    print(f"Downloaded {url} to {filename}")
    return filename
def ctrlsum_for_str(input, length=None, keywords=None) -> str:
    """
    Summarize a raw text string with the shared summarization pipeline.

    Parameters:
    input (str): Text to summarize
    length: Length control forwarded to the pipeline; 0 or None means no control
    keywords (str): Comma-separated keywords; None or blank means no keywords

    Returns:
    str: The generated summaries, each followed by a blank line
    """
    if keywords is not None:
        # Strip every keyword and drop empty ones. Previously only the first
        # item was inspected, so ",bert" discarded all keywords and "a, b"
        # passed " b" with its leading space.
        keywords = [kw.strip() for kw in keywords.split(",") if kw.strip()] or None
    if length == 0:
        length = None
    results = ctrlsum_pipeline.predict(input, type="str",
                                       length=length, keywords=keywords, num_beams=1)
    return "".join(f"{res}\n\n" for res in results["summary"])
def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> Tuple:
    """
    Summarize an uploaded file, a PDF URL, or (when neither is given) raw text.

    Parameters:
    input: Uploaded file object exposing a ``name`` path attribute, or None
    length: Length control; 0 or None means no control
    keywords (str): Comma-separated keywords; blank means no keywords
    text (str): Raw text, used only when no file and no URL are given
    url (str): URL of a PDF; takes precedence over ``input``

    Returns:
    tuple: ``(summary_or_message, raw_text, filename)``. On invalid input the
        message sits in the slot the original code used for it and the other
        slots are None.
    """
    # Normalize once so every branch sees the same values.
    url = (url or "").strip()
    keywords = (keywords or "").strip()
    if length == 0:
        length = None

    # A whitespace-only URL counts as "no URL"; previously it fell through to
    # ``input.name`` and crashed when no file was uploaded.
    if input is None and url == "":
        if not text:
            return None, "Input cannot be left blank.", None
        return ctrlsum_for_str(text, length, keywords), text, None

    if url != "":
        if not (len(url) > 4 and url[-3:] == "pdf"):
            # The original built this tuple but never returned it.
            return "Invalid url(Not PDF)!", None, None
        cached = acl_dict.get(url)
        if cached is not None:
            raw_text = cached["raw_text"]
            # Cache buckets are keyed by the length as a string; "" = no control.
            bucket = "" if length is None else str(length)
            by_keywords = cached.get(bucket)
            # Look up with the same stripped keywords used for the membership test.
            if isinstance(by_keywords, dict) and keywords in by_keywords:
                return by_keywords[keywords], raw_text, None
            # Cache miss: summarize the stored text, forwarding the length
            # as given rather than as a string.
            return ctrlsum_for_str(raw_text, length, keywords or None), raw_text, None
        filename = download_pdf(url, './cache/')
    else:
        filename = input.name

    # Blank keywords become None, matching what ctrlsum_for_str passes on.
    keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()] or None

    # Identify the format of input and parse reference strings
    extension = filename[-4:]
    if extension == ".txt":
        results = ctrlsum_pipeline.predict(filename, type="txt",
                                           save_results=False,
                                           length=length, keywords=keyword_list, num_beams=1)
    elif extension == ".pdf":
        results = ctrlsum_pipeline.predict(filename,
                                           save_results=False, length=length,
                                           keywords=keyword_list, num_beams=1)
    else:
        return "File Format Error !", None, filename

    summary = "".join(f"{res}\n\n" for res in results["summary"])
    return summary, results["raw_text"], filename
# Sample input text (an excerpt introducing BERT, tokenized with spaces around punctuation).
# NOTE(review): presumably the default example shown for ctrlsum_for_str — confirm against the UI code.
ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . 
Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . 
• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
|