File size: 3,313 Bytes
cf4432c
 
e526b4d
cf4432c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e526b4d
 
cf4432c
e526b4d
 
 
 
 
 
 
 
 
 
 
 
 
 
cf4432c
 
 
 
 
 
 
 
05fa727
cddc8f6
 
05fa727
 
16f8a3f
cddc8f6
 
 
05fa727
 
 
 
16f8a3f
 
cf4432c
 
 
 
 
 
cddc8f6
cf4432c
 
 
 
1f545ac
05fa727
 
 
1f545ac
 
 
 
 
 
cf4432c
 
 
 
 
 
05fa727
16f8a3f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json

# Browser-like User-Agent sent with every request so NCBI/arXiv/PMC do not
# reject the scripted traffic outright.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/131.0.6778.135 Safari/537.36'
}

def fetch_pmc_doi(pmc_id):
    """Resolve a PMC ID to a DOI URL via NCBI's ID converter service.

    Args:
        pmc_id: PubMed Central identifier, e.g. "PMC1234567".

    Returns:
        The "https://doi.org/..." URL for the article, or None when the
        converter reports a failure or the record carries no DOI.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?email=raannakasturi@gmail.com&ids={pmc_id}&format=json"
    # timeout prevents an unanswered request from hanging the caller forever
    response = requests.get(url, headers=HEADERS, timeout=30).json()
    if response.get('status') == 'ok':
        # 'doi' can be absent for records that were never assigned one;
        # .get avoids the KeyError the original code could raise here.
        doi = response['records'][0].get('doi')
        if doi:
            return f"https://doi.org/{doi}"
    return None

def fetch_pmc_pdf(pmc_id):
    """Find a direct PDF URL for a PMC article.

    First queries NCBI's Open Access (OA) web service; if the article is
    not in the OA subset (the service answers with an <error> element),
    falls back to scraping the article's landing page for the PDF
    download anchor.

    Args:
        pmc_id: PubMed Central identifier, e.g. "PMC1234567".

    Returns:
        A direct PDF URL, or None when no link could be found.
    """
    # NOTE: the original used `.find(".//error").text` and caught the
    # AttributeError as a branch, then returned from `finally`, which
    # silently swallowed *every* exception.  Explicit `is None` checks
    # keep the same outcomes without hiding real failures in parsing.
    oa_url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmc_id}&format=pdf"
    oa_xml = ET.fromstring(requests.get(oa_url, headers=HEADERS, timeout=30).content)
    if oa_xml.find(".//error") is None:
        # Article is in the OA subset: the record carries an FTP link;
        # rewrite the scheme because NCBI serves the same path over HTTP.
        link = oa_xml.find(".//record/link")
        if link is not None:
            return link.attrib['href'].replace('ftp://', 'http://')
        return None
    # Not in the OA subset -- scrape the article page for the PDF anchor.
    page_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}/"
    page = requests.get(page_url, headers=HEADERS, timeout=30).content
    soup = BeautifulSoup(page, 'html.parser')
    anchor = soup.find("a", {"data-ga-label": "pdf_download_desktop"}, href=True)
    if anchor:
        # href is relative to the article page
        return page_url + anchor['href']
    return None

def fetch_arxiv_doi(arxiv_id):
    """Scrape the DOI link shown on an arXiv abstract page.

    Args:
        arxiv_id: arXiv identifier, e.g. "2501.06029".

    Returns:
        The DOI link text from the page's metadata table.

    Raises:
        AttributeError: if the page has no DOI row; fetch_paper_data
            deliberately treats any exception here as "paper unavailable".
    """
    page_url = f"https://arxiv.org/abs/{arxiv_id}"
    # timeout prevents an unanswered request from hanging the caller forever
    page_content = requests.get(page_url, headers=HEADERS, timeout=30).content
    page_data = BeautifulSoup(page_content, 'html.parser')
    doi_cell = page_data.find('td', {'class': "tablecell arxivdoi"})
    return doi_cell.find('a', {'id': 'arxiv-doi-link'}).text

def fetch_citation_title(doi):
    """Fetch an APA citation and the work's title from the CiteAs API.

    Args:
        doi: A DOI (or DOI URL) accepted by CiteAs.

    Returns:
        (citation, title) tuple; either element may be None when CiteAs
        has no APA-style citation or no name for the work.

    Raises:
        requests.HTTPError: if the CiteAs request does not return 200.
    """
    url = f"https://api.citeas.org/product/{doi}"
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        response.raise_for_status()
    # Parse the body once; the original re-parsed response.json() three times.
    payload = response.json()
    cite = None
    for citation in payload['citations']:
        # keep scanning so the *last* APA entry wins, as the original did
        if citation['style_shortname'] == 'apa':
            cite = citation['citation']
    # .get avoids a KeyError when 'name' is absent; empty string maps to None
    title = payload.get('name') or None
    return cite, title

def fetch_paper_data(id):
    """Gather DOI, title, PDF URL, and citation for a PMC or arXiv paper.

    Args:
        id: Either a PMC ID (prefixed "PMC") or an arXiv ID.  The name
            shadows the builtin but is kept for interface compatibility
            with existing callers.

    Returns:
        A JSON string: {"status": "success", "data": {...}} on success,
        or {"status": "error"} when any lookup step fails.  (The original
        returned a bare "{}" with no status when the DOI or PDF URL was
        missing; callers now always get a status field.)
    """
    data = {'status': 'error'}
    try:
        if id.startswith('PMC'):
            doi = fetch_pmc_doi(id)
            pdf_url = fetch_pmc_pdf(id)
        else:
            doi = fetch_arxiv_doi(id)
            # arXiv PDFs live at a predictable URL; no lookup needed
            pdf_url = f"https://arxiv.org/pdf/{id}"
        if doi and pdf_url:
            citation, title = fetch_citation_title(doi)
            # CiteAs embeds <i> markup and newlines; strip to plain text.
            citation = citation.replace('\n', ' ').replace("<i>", "").replace("</i>", "").strip()
            title = title.replace('\n', ' ').replace("<i>", "").replace("</i>", "").strip()
            data = {
                'status': 'success',
                'data': {
                    'doi': doi,
                    'title': title,
                    'pdf_url': pdf_url,
                    'citation': citation,
                },
            }
    except Exception as e:
        # Best-effort contract: any failure in the lookup chain yields
        # an error status rather than propagating to the caller.
        print(str(e))
    return json.dumps(data, indent=4, ensure_ascii=False)

if __name__ == '__main__':
    # Smoke test: fetch metadata for one arXiv paper and print the JSON result.
    data = fetch_paper_data('2501.06029')
    print(data)