ReXplorePaperDataFetcher / fetch_paper_data.py
raannakasturi's picture
Refactor citation fetching to combine citation and title retrieval in fetch_citation_title function
05fa727
import re
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json
# Default HTTP headers passed to requests.get() by the fetch helpers in this
# module. The User-Agent string presents the client as Googlebot.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 AppleWebKit/537.36 '
        '(KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) '
        'Chrome/131.0.6778.135 Safari/537.36'
    ),
}
def fetch_pmc_doi(pmc_id):
    """Resolve a PubMed Central ID to a DOI URL via the NCBI ID converter.

    Args:
        pmc_id: PMC identifier that is interpolated into the ``ids`` query
            parameter of the idconv service.

    Returns:
        ``"https://doi.org/<doi>"`` when the service answers with
        ``status == "ok"`` and the first record carries a DOI; ``None`` when
        the top-level status is anything other than ``"ok"``.

    Raises:
        KeyError: if the response has no ``status`` key, or if it is "ok" but
            contains no record with a ``doi`` field.
        requests.RequestException: on network failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={pmc_id}&format=json"
    # Without an explicit timeout requests may block forever on a stalled
    # connection.
    payload = requests.get(url, headers=HEADERS, timeout=30).json()
    if payload['status'] != 'ok':
        return None

    records = payload.get('records') or []
    doi = records[0].get('doi') if records else None
    if not doi:
        # Keep signalling a lookup failure (as the bare indexing did), but
        # say which ID was affected instead of raising KeyError('doi').
        raise KeyError(f"no DOI returned by the NCBI ID converter for {pmc_id}")
    return f"https://doi.org/{doi}"
def fetch_pmc_pdf(pmc_id):
    """Find a PDF download URL for a PubMed Central article.

    The PMC Open Access service is queried first. If its XML answer contains
    an ``<error>`` element, the article's HTML page is scraped for the
    desktop "PDF download" link instead.

    Args:
        pmc_id: PMC identifier interpolated into both request URLs.

    Returns:
        The PDF URL as a string, or ``None`` when no link could be found or
        a request / XML parse failed. For links taken from the OA service an
        ``ftp://`` scheme is rewritten to ``http://``.
    """
    from urllib.parse import urljoin

    oa_url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
    try:
        oa_reply = requests.get(oa_url, headers=HEADERS, timeout=30).content
        oa_root = ET.fromstring(oa_reply)

        if oa_root.find(".//error") is None:
            # No error reported: the link lives at records/record/link@href.
            link = oa_root.find("records/record/link")
            href = link.get('href') if link is not None else None
            return href.replace('ftp://', 'http://') if href else None

        # The OA service reported an error; fall back to the article page.
        article_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}/"
        page_html = requests.get(article_url, headers=HEADERS, timeout=30).content
        page = BeautifulSoup(page_html, 'html.parser')
        anchor = page.find("a", {"data-ga-label": "pdf_download_desktop"}, href=True)
        if anchor is None:
            return None
        # urljoin gives the same result as concatenation for relative hrefs
        # and also copes with absolute or root-relative ones.
        return urljoin(article_url, anchor['href'])
    except (requests.RequestException, ET.ParseError):
        # Best effort: callers treat None as "no PDF available". Unlike a
        # `return` inside `finally`, this does not swallow KeyboardInterrupt
        # or unrelated programming errors.
        return None
def fetch_arxiv_doi(arxiv_id):
    """Scrape the DOI link text from an arXiv abstract page.

    Args:
        arxiv_id: arXiv identifier appended to ``https://arxiv.org/abs/``.

    Returns:
        The text of the ``a#arxiv-doi-link`` element found inside the
        ``td.tablecell.arxivdoi`` cell, with surrounding whitespace removed.

    Raises:
        ValueError: if the page contains no such DOI cell or link.
        requests.RequestException: on network failure or timeout.
    """
    page_url = f"https://arxiv.org/abs/{arxiv_id}"
    # Explicit timeout so a stalled connection cannot hang the caller.
    page_content = requests.get(page_url, headers=HEADERS, timeout=30).content
    page_data = BeautifulSoup(page_content, 'html.parser')

    doi_cell = page_data.find('td', {'class': "tablecell arxivdoi"})
    doi_link = None
    if doi_cell is not None:
        doi_link = doi_cell.find('a', {'id': 'arxiv-doi-link'})
    if doi_link is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError(f"no DOI link found on {page_url}")
    return doi_link.text.strip()
def fetch_citation_title(doi):
    """Fetch the APA citation and the title for a DOI from the CiteAs API.

    Args:
        doi: value appended to ``https://api.citeas.org/product/``.

    Returns:
        A ``(citation, title)`` tuple. ``citation`` is the text of the first
        entry in the response's ``citations`` list whose ``style_shortname``
        is ``"apa"``, or ``None`` if there is none. ``title`` is the
        response's ``name`` field, or ``None`` if it is missing or empty.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on network failure or timeout.
        ValueError: if a 200 response body is not valid JSON.
    """
    url = f"https://api.citeas.org/product/{doi}"
    # Explicit timeout so a stalled connection cannot hang the caller.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        response.raise_for_status()
        # Non-200 status that is not an error: still hand back a 2-tuple so
        # callers can always unpack the result.
        return None, None

    payload = response.json()  # decode the body once instead of per access
    cite = next(
        (
            entry.get('citation')
            for entry in payload.get('citations') or []
            if entry.get('style_shortname') == 'apa'
        ),
        None,
    )
    title = payload.get('name') or None
    return cite, title
def _clean_text(text):
    """Replace newlines with spaces, drop <i>/</i> tags and trim *text*."""
    return text.replace('\n', ' ').replace("<i>", "").replace("</i>", "").strip()


def fetch_paper_data(id):
    """Collect DOI, title, PDF URL and APA citation for a paper as JSON.

    IDs starting with ``"PMC"`` are resolved through the PMC helpers; every
    other ID is treated as an arXiv identifier.

    Args:
        id: paper identifier string (the name shadows the builtin but is
            kept so existing keyword callers keep working).

    Returns:
        A JSON string. On success it holds ``{"status": "success", "data":
        {"doi", "title", "pdf_url", "citation"}}``. On any failure it is
        ``{"status": "error"}`` and the error message is printed. A failure
        includes a missing DOI, PDF URL, citation or title.
    """
    data = {}
    try:
        if id.startswith('PMC'):
            doi = fetch_pmc_doi(id)
            pdf_url = fetch_pmc_pdf(id)
        else:
            doi = fetch_arxiv_doi(id)
            pdf_url = f"https://arxiv.org/pdf/{id}"

        # Fail explicitly instead of returning an empty object that has no
        # "status" key at all.
        if not (doi and pdf_url):
            raise ValueError(f"could not resolve DOI and PDF URL for {id}")

        citation, title = fetch_citation_title(doi)
        if citation is None or title is None:
            raise ValueError(f"no APA citation or title available for {doi}")

        data['status'] = 'success'
        data['data'] = {
            'doi': doi,
            'title': _clean_text(title),
            'pdf_url': pdf_url,
            'citation': _clean_text(citation),
        }
    except Exception as e:
        # Top-level boundary: every failure is reported as a plain error
        # status rather than propagated to the caller.
        data = {'status': 'error'}
        print(str(e))
    return json.dumps(data, indent=4, ensure_ascii=False)
if __name__ == '__main__':
    # Manual smoke run: look up one sample arXiv paper and print the JSON.
    result = fetch_paper_data('2501.06029')
    print(result)