"""Helpers for normalizing arXiv URLs and fetching paper metadata from the arXiv export API."""
import pdb
import re
import urllib
import urllib.error
import urllib.request
from collections import namedtuple  # later use py3.7 dataclasses

import feedparser
# Immutable record describing one arXiv paper, as built by get_paper_info():
#   title     - entry title from the feed
#   authors   - entry "authors" value from the feed, stored as parsed
#   abstract  - entry summary from the feed
#   linktopdf - href of the entry's PDF link, or None if the feed had none
#   linktoabs - href of the entry's HTML (abstract page) link, or None
ArxivPaper = namedtuple("ArxivPaper", ["title", "authors", "abstract", "linktopdf", "linktoabs"])
def arxiv_url_sanitizer(url):
    """
    Convert an arXiv PDF url into the matching abstract-page url.

    arxiv.org/pdf/<id>[.pdf] becomes arxiv.org/abs/<id>; any url without
    a "/pdf" path segment or a ".pdf" extension is returned unchanged.
    """
    # Rewrite only a whole "/pdf" path segment, so that paths which merely
    # start with "pdf" (e.g. "/pdfs/") are not mangled into "/abss/".
    url = re.sub(r"/pdf(?=/|$)", "/abs", url)
    # Drop the ".pdf" extension only where it ends the path, i.e. at the end
    # of the url or right before a query string / fragment.
    url = re.sub(r"\.pdf(?=$|[?#])", "", url)
    return url
def get_paper_info(url):
    """
    Given an arxiv url, query the arXiv export API and return
    an ArxivPaper object with fields
    title : str
    authors : the feed entry's "authors" value, passed through unchanged
              (NOTE(review): feedparser presumably yields a list of author
              records here rather than a plain str - confirm against callers)
    abstract : str
    linktopdf : str, or None when the entry has no link whose type mentions "pdf"
    linktoabs : str, or None when the entry has no link whose type mentions "html"

    The arXiv id is taken to be the last path component of ``url``.

    Raises RuntimeError when the API request fails with an HTTP error or
    when the returned feed contains no entries.
    """
    # Strip a trailing slash first; otherwise the last component would be
    # empty and the query would be sent without an id.
    arxiv_id = url.rstrip("/").split("/")[-1]
    arxiv_searchurl = "http://export.arxiv.org/api/query?id_list={}".format(arxiv_id)
    try:
        atom_feed = urllib.request.urlopen(arxiv_searchurl)
    except urllib.error.HTTPError as e:
        # Chain the original error so the HTTP status is not lost.
        raise RuntimeError("Trouble fetching ArXiv Id : {}".format(arxiv_id)) from e

    # Close the HTTP response as soon as the feed has been parsed.
    with atom_feed:
        parsed_feed = feedparser.parse(atom_feed)

    entries = parsed_feed["entries"]
    if not entries:
        # Report an empty result the same way as a failed fetch instead of
        # letting a bare IndexError escape.
        raise RuntimeError("Trouble fetching ArXiv Id : {}".format(arxiv_id))
    paper = entries[0]

    title = paper["title"]
    authors = paper["authors"]
    abstract = paper["summary"]

    linktopdf = None
    linktoabs = None
    for link_dict in paper["links"]:
        # A link without a "type" is simply skipped rather than raising.
        link_type = link_dict.get("type", "")
        if "html" in link_type:
            linktoabs = link_dict["href"]
        elif "pdf" in link_type:
            linktopdf = link_dict["href"]

    # comment = paper["arxiv_comment"] # Not there in all arxiv pages.
    return ArxivPaper(title, authors, abstract, linktopdf, linktoabs)