import re
import json
import requests
import xml.etree.ElementTree as ET

from datetime import date, datetime
from requests.exceptions import HTTPError

# Atom namespace used by arXiv export API responses.
ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}


def _get_today():
    return str(date.today())


def _download_pdf_from_arxiv(filename):
    url = f"https://arxiv.org/pdf/{filename}"
    response = requests.get(url)

    if response.status_code == 200:
        return response.content
    else:
        raise HTTPError(f"Failed to download PDF for arXiv id {filename}")


def download_pdf_from_arxiv(arxiv_id):
    filename = f"{arxiv_id}.pdf"
    pdf_content = _download_pdf_from_arxiv(filename)

    # Save the PDF content to a file named after the arXiv id
    with open(filename, "wb") as f:
        f.write(pdf_content)
    return filename


def _get_papers_from_hf_daily_papers(target_date):
    if target_date is None:
        target_date = _get_today()
        print(f"target_date is not set => scrape today's papers [{target_date}]")

    url = f"https://huggingface.co/api/daily_papers?date={target_date}"
    response = requests.get(url)

    if response.status_code == 200:
        return target_date, response.text
    else:
        raise HTTPError(f"Error fetching data. Status code: {response.status_code}")


def get_papers_from_hf_daily_papers(target_date):
    target_date, results = _get_papers_from_hf_daily_papers(target_date)
    results = json.loads(results)

    # Stamp every record with the date it was scraped for
    for result in results:
        result["target_date"] = target_date
    return target_date, results


def _get_paper_xml_by_arxiv_id(arxiv_id):
    url = f"http://export.arxiv.org/api/query?search_query=id:{arxiv_id}&start=0&max_results=1"
    return requests.get(url)


def _is_arxiv_id_valid(arxiv_id):
    # New-style arXiv ids: YYMM.NNNN (pre-2015) or YYMM.NNNNN (2015 onward),
    # optionally followed by a version suffix such as "v2".
    pattern = r"^\d{4}\.\d{4,5}(v\d+)?$"
    return bool(re.match(pattern, arxiv_id))


def _get_paper_metadata_by_arxiv_id(response):
    root = ET.fromstring(response.content)

    # Extract title, authors, abstract, and publication date from the Atom feed
    title = root.find("atom:entry/atom:title", ATOM_NS).text
    authors = [
        author.find("atom:name", ATOM_NS).text
        for author in root.findall("atom:entry/atom:author", ATOM_NS)
    ]
    abstract = root.find("atom:entry/atom:summary", ATOM_NS).text
    target_date = root.find("atom:entry/atom:published", ATOM_NS).text
    return title, authors, abstract, target_date


def get_papers_from_arxiv_ids(arxiv_ids):
    results = []

    for arxiv_id in arxiv_ids:
        print(arxiv_id)
        if _is_arxiv_id_valid(arxiv_id):
            try:
                xml_data = _get_paper_xml_by_arxiv_id(arxiv_id)
                title, authors, abstract, target_date = _get_paper_metadata_by_arxiv_id(xml_data)

                # Normalize the Atom timestamp to a plain YYYY-MM-DD date
                datetime_obj = datetime.strptime(target_date, "%Y-%m-%dT%H:%M:%SZ")
                formatted_date = datetime_obj.strftime("%Y-%m-%d")

                results.append(
                    {
                        "title": title,
                        "target_date": formatted_date,
                        "paper": {
                            "summary": abstract,
                            "id": arxiv_id,
                            "authors": authors,
                        },
                    }
                )
            except Exception:
                print("......something went wrong while downloading metadata")
                print("......this usually happens for papers published today")
                continue
        else:
            print(f"......not a valid arXiv ID [{arxiv_id}]")

    return results
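
# A minimal usage sketch, not part of the original module: it pulls the
# Hugging Face daily papers feed, looks up one arXiv id via the export API,
# and downloads its PDF. The id "1706.03762" ("Attention Is All You Need")
# is only a well-known illustrative example, not a value from the source.
if __name__ == "__main__":
    # Passing None falls back to today's date inside the helper
    target_date, papers = get_papers_from_hf_daily_papers(None)
    print(f"Fetched {len(papers)} paper(s) for {target_date}")

    metadata = get_papers_from_arxiv_ids(["1706.03762"])
    if metadata:
        print(metadata[0]["title"])
        saved_path = download_pdf_from_arxiv("1706.03762")
        print(f"PDF saved to {saved_path}")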