Spaces:
Running
Running
File size: 4,838 Bytes
a0d8a91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# api_clients/pubmed_client.py
"""
Client for the PubMed API via NCBI's Entrez E-utilities.
This module performs a two-step search: it first finds relevant article IDs
(PMIDs) and then fetches their structured summaries. Review articles are
tried first, with a fallback to an unfiltered search when none match, so the
main orchestrator receives synthesized information whenever it is available.
"""
import aiohttp
from .config import PUBMED_BASE_URL, REQUEST_HEADERS
async def _esearch_pmids(
    session: aiohttp.ClientSession,
    term: str,
    max_results: int,
    timeout_seconds: float = 10,
) -> list[str]:
    """Run a single ESearch request and return the PMIDs it reports.

    Args:
        session: The active HTTP session.
        term: The complete Entrez search term, sent as-is.
        max_results: Value sent as ``retmax``.
        timeout_seconds: Total time budget for the request.

    Returns:
        The ``esearchresult.idlist`` value from the JSON response, or an empty
        list when that key is absent.

    Raises:
        aiohttp.ClientError: On transport failures or a non-2xx status
            (via ``raise_for_status``). Not handled here so the caller can
            apply one error policy to the whole search.
    """
    params = {
        'db': 'pubmed',
        'term': term,
        'retmode': 'json',
        'retmax': max_results,
        'sort': 'relevance',
    }
    async with session.get(
        f"{PUBMED_BASE_URL}/esearch.fcgi",
        params=params,
        headers=REQUEST_HEADERS,
        timeout=aiohttp.ClientTimeout(total=timeout_seconds),
    ) as resp:
        resp.raise_for_status()
        data = await resp.json()
    return data.get('esearchresult', {}).get('idlist', [])


async def search_pubmed(session: aiohttp.ClientSession, query: str, max_results: int = 5) -> list[dict]:
    """Search PubMed and return a list of article summaries.

    The search runs in two steps:

    1. ESearch looks up article IDs (PMIDs) for ``query``, restricted to the
       ``review`` publication type and sorted by relevance. If that yields
       nothing, the same query is retried without the review filter.
    2. ESummary fetches summaries for the PMIDs that were found.

    Args:
        session: The active HTTP session.
        query: The search term, e.g. ``"Migraine AND Aura"``. It is passed to
            Entrez unchanged apart from being wrapped in parentheses for the
            review-filtered attempt.
        max_results: The maximum number of article summaries to return.

    Returns:
        A list of dicts with the keys ``uid``, ``title``, ``pubdate``,
        ``authors``, ``journal`` and ``url``, in the order ESearch returned
        the PMIDs. An empty list is returned when the query is blank, when
        ``max_results`` is not positive, when nothing is found, or when any
        error occurs (errors are printed, never raised).
    """
    # Nothing meaningful to ask for: avoid spending network round-trips.
    if not query or not query.strip() or max_results <= 0:
        return []

    try:
        # --- Step 1: ESearch - find relevant article PMIDs ---
        # Prefer review articles; they are the most useful for summarization.
        pmids = await _esearch_pmids(
            session, f"({query}) AND review[Publication Type]", max_results
        )
        if not pmids:
            # No reviews matched: retry with the bare query as a fallback.
            print(f"No review articles found for '{query}'. Broadening search...")
            pmids = await _esearch_pmids(session, query, max_results)
        if not pmids:
            print(f"No PubMed results found for query: {query}")
            return []

        # --- Step 2: ESummary - fetch summaries for the found PMIDs ---
        esummary_params = {
            'db': 'pubmed',
            'id': ",".join(pmids),  # E-utilities accept a comma-separated ID list
            'retmode': 'json',
        }
        async with session.get(
            f"{PUBMED_BASE_URL}/esummary.fcgi",
            params=esummary_params,
            headers=REQUEST_HEADERS,
            timeout=aiohttp.ClientTimeout(total=15),
        ) as resp:
            resp.raise_for_status()
            summary_data = await resp.json()

        # 'result' maps each PMID to its summary record. Walking `pmids`
        # (rather than the mapping) preserves the relevance ordering.
        results = summary_data.get('result', {})
        parsed_articles = []
        for pmid in pmids:
            article = results.get(pmid)
            if article is None:
                continue
            parsed_articles.append({
                'uid': article.get('uid', pmid),
                'title': article.get('title', 'Title Not Available'),
                'pubdate': article.get('pubdate', 'N/A'),
                # Skip author entries without a usable name instead of letting
                # one malformed entry discard every article via KeyError.
                'authors': [
                    author['name']
                    for author in (article.get('authors') or [])
                    if isinstance(author, dict) and author.get('name')
                ],
                'journal': article.get('source', 'N/A'),
                'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            })
        return parsed_articles
    except aiohttp.ClientError as e:
        print(f"An error occurred while fetching from PubMed: {e}")
        return []
    except Exception as e:
        # Deliberate best-effort boundary: callers always get a list back,
        # whatever goes wrong (e.g. an unexpected payload shape).
        print(f"A general error occurred in the pubmed_client: {e}")
        return []