# paper_qa/paper/download.py
import re
import json
import requests
from datetime import date, datetime
import xml.etree.ElementTree as ET
from requests.exceptions import HTTPError
def _get_today():
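    """Return today's date as an ISO-formatted string (YYYY-MM-DD)."""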
return str(date.today())
def _download_pdf_from_arxiv(filename):
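    """Fetch the raw PDF bytes for `filename` from arxiv.org."""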
    url = f'https://arxiv.org/pdf/{filename}'
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        return response.content
    else:
        raise HTTPError(f"Failed to download PDF for arXiv id {filename} (status code: {response.status_code})")
def download_pdf_from_arxiv(arxiv_id):
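    """Download the PDF for `arxiv_id` and save it as `<arxiv_id>.pdf` in the current directory."""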
filename = f"{arxiv_id}.pdf"
pdf_content = _download_pdf_from_arxiv(filename)
# Save the pdf content to a file
with open(filename, "wb") as f:
f.write(pdf_content)
return filename
def _get_papers_from_hf_daily_papers(target_date):
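    """Fetch the raw JSON of Hugging Face's daily papers for `target_date` (defaults to today)."""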
if target_date is None:
target_date = _get_today()
        print(f"target_date is not set => scraping today's papers [{target_date}]")
url = f"https://huggingface.co/api/daily_papers?date={target_date}"
    response = requests.get(url, timeout=30)
if response.status_code == 200:
return target_date, response.text
else:
raise HTTPError(f"Error fetching data. Status code: {response.status_code}")
def get_papers_from_hf_daily_papers(target_date):
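    """Return (target_date, papers) from Hugging Face's daily papers API, tagging each paper with its target date."""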
target_date, results = _get_papers_from_hf_daily_papers(target_date)
results = json.loads(results)
for result in results:
result["target_date"] = target_date
return target_date, results
def _get_paper_xml_by_arxiv_id(arxiv_id):
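    """Query the arXiv API for a single paper and return the raw HTTP response (Atom XML)."""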
url = f"http://export.arxiv.org/api/query?search_query=id:{arxiv_id}&start=0&max_results=1"
    return requests.get(url, timeout=30)
def _is_arxiv_id_valid(arxiv_id):
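    """Check whether `arxiv_id` looks like a new-style arXiv identifier."""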
    # New-style arXiv IDs are YYMM.NNNN (before 2015) or YYMM.NNNNN (2015 onward),
    # optionally followed by a version suffix such as "v2".
    pattern = r"^\d{4}\.\d{4,5}(v\d+)?$"
    return bool(re.match(pattern, arxiv_id))
def _get_paper_metadata_by_arxiv_id(response):
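    """Parse title, authors, abstract, and published date out of an arXiv Atom response."""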
    root = ET.fromstring(response.content)
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    entry = root.find('atom:entry', ns)
    # Extract title, authors, abstract, and publication date from the Atom entry
    title = entry.find('atom:title', ns).text
    authors = [author.find('atom:name', ns).text for author in entry.findall('atom:author', ns)]
    abstract = entry.find('atom:summary', ns).text
    target_date = entry.find('atom:published', ns).text
    return title, authors, abstract, target_date
def get_papers_from_arxiv_ids(arxiv_ids):
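    """Build a list of paper metadata dicts for the given arXiv IDs, skipping invalid or unavailable ones."""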
results = []
for arxiv_id in arxiv_ids:
print(arxiv_id)
if _is_arxiv_id_valid(arxiv_id):
try:
xml_data = _get_paper_xml_by_arxiv_id(arxiv_id)
title, authors, abstract, target_date = _get_paper_metadata_by_arxiv_id(xml_data)
datetime_obj = datetime.strptime(target_date, "%Y-%m-%dT%H:%M:%SZ")
formatted_date = datetime_obj.strftime("%Y-%m-%d")
results.append(
{
"title": title,
"target_date": formatted_date,
"paper": {
"summary": abstract,
"id": arxiv_id,
"authors" : authors,
}
}
)
            except Exception:
                print("......something went wrong while downloading the metadata")
                print("......this usually happens when the paper was published today and its metadata is not yet available")
                continue
else:
            print(f"......not a valid arXiv ID [{arxiv_id}]")
return results
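# A minimal usage sketch (not part of the original module): the arXiv ID and
# date below are illustrative placeholders, not values the module depends on.
if __name__ == "__main__":
    # Fetch metadata for one paper and download its PDF.
    papers = get_papers_from_arxiv_ids(["2106.09685"])
    for paper in papers:
        pdf_path = download_pdf_from_arxiv(paper["paper"]["id"])
        print(f"{paper['title']} -> {pdf_path}")
    # Fetch Hugging Face's daily papers for a specific date.
    target_date, daily_papers = get_papers_from_hf_daily_papers("2024-01-15")
    print(f"{len(daily_papers)} papers listed on {target_date}")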