File size: 3,313 Bytes
cf4432c
 
e526b4d
cf4432c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e526b4d
 
cf4432c
e526b4d
 
 
 
 
 
 
 
 
 
 
 
 
 
cf4432c
 
 
 
 
 
 
 
05fa727
cddc8f6
 
05fa727
 
16f8a3f
cddc8f6
 
 
05fa727
 
 
 
16f8a3f
 
cf4432c
 
 
 
 
 
cddc8f6
cf4432c
 
 
 
1f545ac
05fa727
 
 
1f545ac
 
 
 
 
 
cf4432c
 
 
 
 
 
05fa727
16f8a3f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json

# Browser-like User-Agent sent with every request so NCBI/arXiv/PMC do not
# reject the scripted traffic outright.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
}

def fetch_pmc_doi(pmc_id):
    """Resolve a PMC ID to a DOI URL via NCBI's ID converter service.

    Args:
        pmc_id: PubMed Central identifier, e.g. "PMC1234567".

    Returns:
        The "https://doi.org/..." URL for the article, or None when the
        converter reports a failure or the record carries no DOI.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={pmc_id}&format=json"
    # timeout prevents an unanswered request from hanging the caller forever
    response = requests.get(url, headers=HEADERS, timeout=30).json()
    if response.get('status') == 'ok':
        # 'doi' can be absent for records that were never assigned one;
        # .get avoids the KeyError the original code could raise here.
        doi = response['records'][0].get('doi')
        if doi:
            return f"https://doi.org/{doi}"
    return None

def fetch_pmc_pdf(pmc_id):
    """Find a direct PDF URL for a PMC article.

    First queries NCBI's Open Access (OA) web service; if the article is
    not in the OA subset (the service answers with an <error> element),
    falls back to scraping the article's landing page for the PDF
    download anchor.

    Args:
        pmc_id: PubMed Central identifier, e.g. "PMC1234567".

    Returns:
        A direct PDF URL, or None when no link could be found.
    """
    # NOTE: the original used `.find(".//error").text` and caught the
    # AttributeError as a branch, then returned from `finally`, which
    # silently swallowed *every* exception.  Explicit `is None` checks
    # keep the same outcomes without hiding real failures in parsing.
    oa_url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
    oa_xml = ET.fromstring(requests.get(oa_url, headers=HEADERS, timeout=30).content)
    if oa_xml.find(".//error") is None:
        # Article is in the OA subset: the record carries an FTP link;
        # rewrite the scheme because NCBI serves the same path over HTTP.
        link = oa_xml.find(".//record/link")
        if link is not None:
            return link.attrib['href'].replace('ftp://', 'http://')
        return None
    # Not in the OA subset -- scrape the article page for the PDF anchor.
    page_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}/"
    page = requests.get(page_url, headers=HEADERS, timeout=30).content
    soup = BeautifulSoup(page, 'html.parser')
    anchor = soup.find("a", {"data-ga-label": "pdf_download_desktop"}, href=True)
    if anchor:
        # href is relative to the article page
        return page_url + anchor['href']
    return None

def fetch_arxiv_doi(arxiv_id):
    """Scrape the DOI link shown on an arXiv abstract page.

    Args:
        arxiv_id: arXiv identifier, e.g. "2501.06029".

    Returns:
        The DOI link text from the page's metadata table.

    Raises:
        AttributeError: if the page has no DOI row; fetch_paper_data
            deliberately treats any exception here as "paper unavailable".
    """
    page_url = f"https://arxiv.org/abs/{arxiv_id}"
    # timeout prevents an unanswered request from hanging the caller forever
    page_content = requests.get(page_url, headers=HEADERS, timeout=30).content
    page_data = BeautifulSoup(page_content, 'html.parser')
    doi_cell = page_data.find('td', {'class': "tablecell arxivdoi"})
    return doi_cell.find('a', {'id': 'arxiv-doi-link'}).text

def fetch_citation_title(doi):
    """Fetch an APA citation and the work's title from the CiteAs API.

    Args:
        doi: A DOI (or DOI URL) accepted by CiteAs.

    Returns:
        (citation, title) tuple; either element may be None when CiteAs
        has no APA-style citation or no name for the work.

    Raises:
        requests.HTTPError: if the CiteAs request does not return 200.
    """
    url = f"https://api.citeas.org/product/{doi}"
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        response.raise_for_status()
    # Parse the body once; the original re-parsed response.json() three times.
    payload = response.json()
    cite = None
    for citation in payload['citations']:
        # keep scanning so the *last* APA entry wins, as the original did
        if citation['style_shortname'] == 'apa':
            cite = citation['citation']
    # .get avoids a KeyError when 'name' is absent; empty string maps to None
    title = payload.get('name') or None
    return cite, title

def fetch_paper_data(id):
    """Gather DOI, title, PDF URL, and citation for a PMC or arXiv paper.

    Args:
        id: Either a PMC ID (prefixed "PMC") or an arXiv ID.  The name
            shadows the builtin but is kept for interface compatibility
            with existing callers.

    Returns:
        A JSON string: {"status": "success", "data": {...}} on success,
        or {"status": "error"} when any lookup step fails.  (The original
        returned a bare "{}" with no status when the DOI or PDF URL was
        missing; callers now always get a status field.)
    """
    data = {'status': 'error'}
    try:
        if id.startswith('PMC'):
            doi = fetch_pmc_doi(id)
            pdf_url = fetch_pmc_pdf(id)
        else:
            doi = fetch_arxiv_doi(id)
            # arXiv PDFs live at a predictable URL; no lookup needed
            pdf_url = f"https://arxiv.org/pdf/{id}"
        if doi and pdf_url:
            citation, title = fetch_citation_title(doi)
            # CiteAs embeds <i> markup and newlines; strip to plain text.
            citation = citation.replace('\n', ' ').replace("<i>", "").replace("</i>", "").strip()
            title = title.replace('\n', ' ').replace("<i>", "").replace("</i>", "").strip()
            data = {
                'status': 'success',
                'data': {
                    'doi': doi,
                    'title': title,
                    'pdf_url': pdf_url,
                    'citation': citation,
                },
            }
    except Exception as e:
        # Best-effort contract: any failure in the lookup chain yields
        # an error status rather than propagating to the caller.
        print(str(e))
    return json.dumps(data, indent=4, ensure_ascii=False)

if __name__ == '__main__':
    # Smoke test: fetch metadata for one arXiv paper and print the JSON result.
    data = fetch_paper_data('2501.06029')
    print(data)