Spaces:
Runtime error
Runtime error
File size: 1,659 Bytes
63135a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from collections import namedtuple # later use py3.7 dataclasses
import urllib
import feedparser
import pdb
# Immutable record describing one arXiv paper: its title, authors, abstract,
# and the links to the PDF and to the abstract page.
ArxivPaper = namedtuple(
    "ArxivPaper",
    "title authors abstract linktopdf linktoabs",
)
def arxiv_url_sanitizer(url):
    """
    Rewrite an arXiv pdf link so that it points at the abstract page.

    Every "/pdf" in the url becomes "/abs" and every ".pdf" is dropped,
    e.g. arxiv.org/pdf/1234.5678.pdf -> arxiv.org/abs/1234.5678.
    A url that does not mention "pdf" at all is returned unchanged.
    """
    # Guard clause: nothing to rewrite unless the url mentions "pdf".
    if "pdf" not in url:
        return url

    abs_url = url.replace("/pdf", "/abs")
    return abs_url.replace(".pdf", "")
def get_paper_info(url):
    """
    Given an arxiv url, query the arXiv export API for the paper and
    return an ArxivPaper object with fields
        title     : the feed entry's "title"
        authors   : the feed entry's "authors", passed through untouched
                    (NOTE(review): feedparser probably exposes this as a
                    list of author dicts rather than a str -- confirm)
        abstract  : the feed entry's "summary"
        linktopdf : href of the first link whose type mentions "pdf",
                    or None when the entry has no such link
        linktoabs : href of the first link whose type mentions "html",
                    or None when the entry has no such link

    The arxiv id is taken from the last path segment of the url; a
    trailing "/" and a ".pdf" suffix are ignored.

    Raises RuntimeError when the API answers with an HTTP error or when
    the returned feed has no entry for the id.
    """
    # A bare `import urllib` does not guarantee that the `request` and
    # `error` submodules are loaded, so import them explicitly here.
    import urllib.error
    import urllib.request

    # Last path segment is the id; tolerate "…/abs/1234.5678/" and
    # "…/pdf/1234.5678.pdf" style urls.
    arxiv_id = url.rstrip("/").split("/")[-1]
    if arxiv_id.endswith(".pdf"):
        arxiv_id = arxiv_id[: -len(".pdf")]
    arxiv_searchurl = "http://export.arxiv.org/api/query?id_list={}".format(arxiv_id)

    try:
        atom_feed = urllib.request.urlopen(arxiv_searchurl)
    except urllib.error.HTTPError as e:
        # Chain the original error so the HTTP status is not lost.
        raise RuntimeError("Trouble fetching ArXiv Id : {}".format(arxiv_id)) from e

    # Parse while the response is open, then close the connection.
    with atom_feed:
        parsed_feed = feedparser.parse(atom_feed)

    entries = parsed_feed["entries"]
    if not entries:
        # Report an empty feed the same way as a failed fetch instead of
        # letting an IndexError escape.
        raise RuntimeError("No ArXiv entry found for Id : {}".format(arxiv_id))
    paper = entries[0]

    title = paper["title"]
    authors = paper["authors"]
    abstract = paper["summary"]

    linktopdf = None
    linktoabs = None
    for link_dict in paper["links"]:
        # An entry may carry extra links (e.g. a "doi" link without an
        # explicit type). Read the type defensively and keep only the first
        # match of each kind so a later link cannot replace the abs/pdf link.
        link_type = link_dict.get("type") or ""
        if "html" in link_type:
            if linktoabs is None:
                linktoabs = link_dict["href"]
        elif "pdf" in link_type:
            if linktopdf is None:
                linktopdf = link_dict["href"]

    # "arxiv_comment" is not present on every entry, so it is not extracted.
    return ArxivPaper(title, authors, abstract, linktopdf, linktoabs)
|