File size: 3,888 Bytes
aae312e
f62a8d2
 
ee63964
f62a8d2
 
 
 
 
aae312e
 
 
 
 
f62a8d2
aae312e
 
83ccb99
f62a8d2
 
aae312e
 
f62a8d2
aae312e
 
 
 
f62a8d2
 
 
 
aae312e
f62a8d2
aae312e
 
 
 
 
 
 
 
 
 
 
 
ee63964
 
f62a8d2
 
 
aae312e
 
f62a8d2
aae312e
 
f62a8d2
aae312e
 
f62a8d2
aae312e
f62a8d2
 
 
 
 
aae312e
 
 
 
f62a8d2
aae312e
ee63964
f62a8d2
aae312e
 
f62a8d2
aae312e
f62a8d2
aae312e
 
 
f62a8d2
 
aae312e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3
"""
MedGenesis – arXiv async fetcher (Atom API).

* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
* Async httpx fetch with 2×/4× exponential-back-off retry.
* Parses the Atom feed with feedparser inside a thread (non-blocking).
* 6-hour LRU cache keyed by “query+max_results”.
* Returns a list of dicts matching schemas.Paper.

API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations

import asyncio
import time
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus

import feedparser
import httpx

_BASE   = "https://export.arxiv.org/api/query?search_query="
_TIMEOUT = 10
_MAX_RES = 25
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}


# ──────────────────────────────────────────────────────────────────────
# Internal fetch helper with retry
# ──────────────────────────────────────────────────────────────────────
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return raw Atom XML from arXiv, retrying with exponential back-off.

    Args:
        query: Free-text search expression; URL-encoded before being sent.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Total number of attempts made before giving up.

    Returns:
        The response body of the first attempt answered with HTTP 200.

    Raises:
        RuntimeError: If no attempt returned HTTP 200.
        httpx.TransportError: If the final attempt failed at the transport
            level (timeout, connection error, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None
    # One client for all attempts so the connection pool can be reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(retries):
            is_final = attempt == retries - 1
            try:
                last = await cli.get(url)
            except httpx.TransportError:
                # Transient network trouble is retried like a bad status;
                # only the last failure is surfaced to the caller.
                if is_final:
                    raise
            else:
                if last.status_code == 200:
                    return last.text
            if not is_final:
                # Back off between attempts only; sleeping after the last
                # attempt would merely delay the error.
                await asyncio.sleep(delay)
                delay *= 2
    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")


# ──────────────────────────────────────────────────────────────────────
# Public cached fetch + parse
# ──────────────────────────────────────────────────────────────────────
# Cache policy for fetch_arxiv: entries expire after six hours and at most
# 256 distinct (query, max_results) pairs are retained.
_PAPER_CACHE_TTL_S = 6 * 60 * 60
_PAPER_CACHE_MAX_ENTRIES = 256
# (query, max_results) -> (time.monotonic() at fetch, parsed paper dicts).
# Insertion order doubles as recency order: the first key is the LRU entry.
_PAPER_CACHE: Dict[tuple, tuple] = {}


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return arXiv paper dicts compatible with schemas.Paper.

    Parsed results are cached per ``(query, max_results)`` for six hours.
    ``functools.lru_cache`` is deliberately not used: on a coroutine function
    it caches the coroutine object itself, which can only be awaited once, so
    every repeated call would fail.

    Args:
        query: Free-text search expression passed to the arXiv API.
        max_results: Number of entries to request.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``summary``,
        ``link``, ``published`` and ``source``. The dicts are fresh copies, so
        callers may mutate them without affecting cached data.

    Raises:
        RuntimeError: Propagated from ``_fetch_raw`` when arXiv never answers
            with HTTP 200.
    """
    key = (query, max_results)
    hit = _PAPER_CACHE.pop(key, None)
    if hit is not None and time.monotonic() - hit[0] < _PAPER_CACHE_TTL_S:
        _PAPER_CACHE[key] = hit  # re-insert at the end: most recently used
        return [dict(paper) for paper in hit[1]]

    xml_text = await _fetch_raw(query, max_results)

    # feedparser is blocking; parse in thread
    feed = await asyncio.to_thread(feedparser.parse, xml_text)

    papers: List[Dict] = []
    for ent in feed.entries:
        # Skip author records without a usable name instead of raising.
        names = [
            name
            for name in (
                getattr(author, "name", None)
                for author in (getattr(ent, "authors", None) or [])
            )
            if name
        ]
        papers.append({
            "title"    : getattr(ent, "title", "[No title]"),
            # Missing *or empty* author lists both fall back to "Unknown".
            "authors"  : ", ".join(names) or "Unknown",
            "summary"  : getattr(ent, "summary", ""),
            "link"     : getattr(ent, "link", ""),
            "published": getattr(ent, "published", ""),
            "source"   : "arXiv",
        })

    # A concurrent call may have stored this key while we were awaiting;
    # drop it first so the fresh entry lands at the most-recent position.
    _PAPER_CACHE.pop(key, None)
    _PAPER_CACHE[key] = (time.monotonic(), papers)
    while len(_PAPER_CACHE) > _PAPER_CACHE_MAX_ENTRIES:
        del _PAPER_CACHE[next(iter(_PAPER_CACHE))]  # evict least recently used
    return [dict(paper) for paper in papers]


# Keep the ``cache_clear`` hook that the former lru_cache wrapper exposed.
fetch_arxiv.cache_clear = _PAPER_CACHE.clear


# ──────────────────────────────────────────────────────────────────────
# CLI demo
# ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    async def _demo() -> None:
        """Fetch three results for a sample query and print each title."""
        results = await fetch_arxiv("glioblastoma CRISPR", max_results=3)
        for title in (paper["title"] for paper in results):
            print(title)

    # Drive the coroutine to completion on a fresh event loop.
    asyncio.run(_demo())