File size: 3,888 Bytes
aae312e f62a8d2 ee63964 f62a8d2 aae312e f62a8d2 aae312e 83ccb99 f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e ee63964 f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e ee63964 f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e f62a8d2 aae312e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
#!/usr/bin/env python3
"""
MedGenesis – arXiv async fetcher (Atom API).
* Uses HTTPS (`https://export.arxiv.org/...`) to avoid HTTP 301 redirects.
* Async httpx fetch with 2×/4× exponential-back-off retry.
* Parses the Atom feed with feedparser inside a thread (non-blocking).
* 6-hour LRU cache keyed by "query+max_results".
* Returns a list of dicts matching schemas.Paper.
API docs: https://arxiv.org/help/api/user-manual
"""
from __future__ import annotations

import asyncio
import time
from functools import lru_cache
from typing import List, Dict
from urllib.parse import quote_plus

import feedparser
import httpx
# URL prefix for arXiv query requests; the URL-quoted search string is
# appended directly after "search_query=".
_BASE = "https://export.arxiv.org/api/query?search_query="
# Timeout handed to httpx.AsyncClient for every request (httpx treats a bare
# number as seconds).
_TIMEOUT = 10
# Upper bound that the caller's max_results is clamped to before the request
# URL is built.
_MAX_RES = 25
# Headers attached to every request made by the fetch helper.
_HEADERS = {"User-Agent": "MedGenesis/1.0 (https://huggingface.co/spaces)"}
# ──────────────────────────────────────────────────────────────────────
# Internal fetch helper with retry
# ──────────────────────────────────────────────────────────────────────
async def _fetch_raw(query: str, max_results: int, *, retries: int = 3) -> str:
    """Return raw Atom XML from the arXiv query API.

    Args:
        query: Free-text search string; it is URL-quoted before being sent.
        max_results: Requested number of entries, clamped to ``1.._MAX_RES``.
        retries: Total number of attempts to make before giving up.

    Returns:
        The response body of the first attempt that answers with HTTP 200.

    Raises:
        RuntimeError: If every attempt completed with a non-200 status (or
            ``retries`` is zero, so no attempt was made).
        httpx.RequestError: If the final attempt fails at the transport level
            (timeout, connection error, ...).
    """
    max_results = max(1, min(max_results, _MAX_RES))
    url = f"{_BASE}{quote_plus(query)}&max_results={max_results}"
    delay = 2
    last: httpx.Response | None = None

    # One client for all attempts so the connection pool is reused instead of
    # being torn down and rebuilt on every retry.
    async with httpx.AsyncClient(timeout=_TIMEOUT, headers=_HEADERS) as cli:
        for attempt in range(retries):
            is_final = attempt == retries - 1
            try:
                last = await cli.get(url)
            except httpx.RequestError:
                # Transient network failures are what the retry loop is for;
                # only surface the error once no attempts remain.
                if is_final:
                    raise
            else:
                if last.status_code == 200:
                    return last.text

            # Back off 2 s, 4 s, ... between attempts, but do not make the
            # caller wait after the last one has already failed.
            if not is_final:
                await asyncio.sleep(delay)
                delay *= 2

    raise RuntimeError(f"arXiv API failed: {last.status_code if last else 'No response'}")
# ──────────────────────────────────────────────────────────────────────
# Public cached fetch + parse
# ──────────────────────────────────────────────────────────────────────
_CACHE_TTL_S = 6 * 60 * 60  # cached results expire after six hours
_CACHE_MAX_ENTRIES = 256
# (query, max_results) -> (monotonic expiry timestamp, parsed paper dicts)
_PAPER_CACHE: dict = {}


def _entry_to_paper(ent) -> Dict:
    """Map one feedparser entry onto the dict shape returned by fetch_arxiv."""
    # Author records without a usable name are skipped rather than raising.
    names = [getattr(a, "name", "") for a in getattr(ent, "authors", [])]
    authors = ", ".join(n for n in names if n) or "Unknown"
    return {
        "title": getattr(ent, "title", "[No title]"),
        "authors": authors,
        "summary": getattr(ent, "summary", ""),
        "link": getattr(ent, "link", ""),
        "published": getattr(ent, "published", ""),
        "source": "arXiv",
    }


async def fetch_arxiv(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return arXiv paper dicts compatible with schemas.Paper.

    Results are cached in-process for six hours, keyed by
    ``(query, max_results)``. ``functools.lru_cache`` cannot be used here: on
    an ``async def`` it caches the coroutine object, and awaiting that object
    a second time raises ``RuntimeError``.

    Args:
        query: Free-text arXiv search string.
        max_results: Number of entries to request.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``summary``,
        ``link``, ``published`` and ``source``. Each call returns fresh dict
        copies, so callers may mutate the result without corrupting the cache.

    Raises:
        Whatever ``_fetch_raw`` raises when the request ultimately fails.
    """
    key = (query, max_results)
    cached = _PAPER_CACHE.get(key)
    if cached is not None and cached[0] > time.monotonic():
        return [dict(p) for p in cached[1]]

    xml_text = await _fetch_raw(query, max_results)
    # feedparser is blocking; parse in a worker thread.
    feed = await asyncio.to_thread(feedparser.parse, xml_text)
    papers = [_entry_to_paper(ent) for ent in feed.entries]

    # Drop any stale entry first so the refreshed one is the newest, then
    # evict the oldest entries to keep the cache bounded.
    _PAPER_CACHE.pop(key, None)
    while len(_PAPER_CACHE) >= _CACHE_MAX_ENTRIES:
        _PAPER_CACHE.pop(next(iter(_PAPER_CACHE)))
    _PAPER_CACHE[key] = (time.monotonic() + _CACHE_TTL_S, papers)
    return [dict(p) for p in papers]


# Keep the ``cache_clear()`` hook that the previous lru_cache wrapper exposed.
fetch_arxiv.cache_clear = _PAPER_CACHE.clear
# ──────────────────────────────────────────────────────────────────────
# CLI demo
# ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":

    async def _demo():
        """Fetch three papers for a sample query and print each title."""
        for paper in await fetch_arxiv("glioblastoma CRISPR", max_results=3):
            print(paper["title"])

    asyncio.run(_demo())
|