Spaces:
Running
Running
# api_clients/pubmed_client.py
"""
Client for the PubMed API via NCBI's Entrez E-utilities.

This module performs a two-step search: it first finds relevant article IDs
(PMIDs) with ESearch and then fetches their structured summaries with ESummary.
Review articles are preferred, with a fallback to an unfiltered search when no
reviews match, so that the main orchestrator receives synthesized information
whenever it is available.
"""
import aiohttp | |
from .config import PUBMED_BASE_URL, REQUEST_HEADERS | |
async def search_pubmed(session: aiohttp.ClientSession, query: str, max_results: int = 5) -> list[dict]:
    """
    Search PubMed and return a list of article summaries.

    The search runs in two steps:

    1. ESearch looks up PMIDs for ``query`` restricted to review articles
       (``review[Publication Type]``), sorted by relevance. If that yields
       nothing, the same query is retried without the review filter.
    2. ESummary fetches the summaries for the PMIDs that were found.

    The query is sent as given (wrapped in parentheses for the review search);
    no field tag such as ``[Title/Abstract]`` is added by this function.

    Args:
        session: The active HTTP session used for both requests.
        query: The search term, e.g. ``"Migraine AND Aura"``. Surrounding
            whitespace is ignored.
        max_results: The maximum number of article summaries to return.
            Values below 1 return an empty list without contacting PubMed.

    Returns:
        A list of dictionaries with the keys ``uid``, ``title``, ``pubdate``,
        ``authors``, ``journal`` and ``url``, in the order ESearch ranked the
        PMIDs. An empty list is returned when the query is empty, nothing is
        found, or any error occurs (errors are printed, never raised).
    """
    query = query.strip() if query else ""
    if not query or max_results < 1:
        return []

    esearch_url = f"{PUBMED_BASE_URL}/esearch.fcgi"
    esummary_url = f"{PUBMED_BASE_URL}/esummary.fcgi"
    # Explicit timeouts for every request, including the fallback search.
    search_timeout = aiohttp.ClientTimeout(total=10)
    summary_timeout = aiohttp.ClientTimeout(total=15)

    try:
        # --- Step 1: ESearch - find relevant article PMIDs ---
        # Prefer review articles: they are the most useful for summarization.
        pmids = await _esearch_pmids(
            session,
            esearch_url,
            f"({query}) AND review[Publication Type]",
            max_results,
            search_timeout,
        )

        if not pmids:
            # No reviews matched; retry with the plain query as a fallback.
            print(f"No review articles found for '{query}'. Broadening search...")
            pmids = await _esearch_pmids(
                session, esearch_url, query, max_results, search_timeout
            )

        if not pmids:
            print(f"No PubMed results found for query: {query}")
            return []

        # --- Step 2: ESummary - fetch summaries for the found PMIDs ---
        esummary_params = {
            'db': 'pubmed',
            'id': ",".join(pmids),  # E-utilities accept a comma-separated ID list
            'retmode': 'json',
        }
        async with session.get(
            esummary_url,
            params=esummary_params,
            headers=REQUEST_HEADERS,
            timeout=summary_timeout,
        ) as resp:
            resp.raise_for_status()
            summary_data = await resp.json()

        # 'result' maps each PMID to its summary record.
        return _parse_summaries(pmids, summary_data.get('result', {}))

    except aiohttp.ClientError as e:
        print(f"An error occurred while fetching from PubMed: {e}")
        return []
    except Exception as e:
        # Boundary handler: callers are promised a list, never an exception
        # (this also covers timeouts and malformed JSON payloads).
        print(f"A general error occurred in the pubmed_client: {e}")
        return []


async def _esearch_pmids(
    session: aiohttp.ClientSession,
    url: str,
    term: str,
    max_results: int,
    timeout: aiohttp.ClientTimeout,
) -> list[str]:
    """Run one ESearch request and return the PMIDs from its ``idlist``."""
    # A fresh params dict per call, so the review and fallback searches never
    # share mutable state.
    params = {
        'db': 'pubmed',
        'term': term,
        'retmode': 'json',
        'retmax': max_results,
        'sort': 'relevance',
    }
    async with session.get(
        url, params=params, headers=REQUEST_HEADERS, timeout=timeout
    ) as resp:
        resp.raise_for_status()
        data = await resp.json()
    return data.get('esearchresult', {}).get('idlist', [])


def _parse_summaries(pmids: list[str], results: dict) -> list[dict]:
    """Convert the ESummary ``result`` mapping into a list ordered like ``pmids``."""
    parsed_articles = []
    for pmid in pmids:
        article = results.get(pmid)
        if not isinstance(article, dict):
            # PMID missing from the response (or not a record): skip it.
            continue
        # Tolerate author entries without a 'name' instead of failing the
        # whole batch on a single malformed record.
        authors = [
            author['name']
            for author in (article.get('authors') or [])
            if isinstance(author, dict) and author.get('name')
        ]
        parsed_articles.append({
            'uid': article.get('uid', pmid),
            'title': article.get('title', 'Title Not Available'),
            'pubdate': article.get('pubdate', 'N/A'),
            'authors': authors,
            'journal': article.get('source', 'N/A'),
            'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        })
    return parsed_articles