Spaces:

mgbam
/

project-asclepius

Running

App Files Files Community

project-asclepius / api_clients /pubmed_client.py

mgbam

Update api_clients/pubmed_client.py

a0d8a91 verified 4 days ago

raw

history blame contribute delete

4.84 kB

	# api_clients/pubmed_client.py
	"""
	Client for the PubMed API via NCBI's Entrez E-utilities.
	This module performs a two-step search: it first finds relevant article
	IDs (PMIDs) with ESearch and then fetches their structured summaries with
	ESummary. Review articles are preferred, with a fallback to an unfiltered
	search when none are found, so that the main orchestrator receives
	high-quality, synthesized information.
	"""
	import aiohttp
	from .config import PUBMED_BASE_URL, REQUEST_HEADERS

	async def search_pubmed(session: aiohttp.ClientSession, query: str, max_results: int = 5) -> list[dict]:
	    """
	    Search PubMed and return a list of article summaries.

	    The lookup is a two-step E-utilities exchange:
	    1. ESearch is called with the query wrapped as
	       ``(<query>) AND review[Publication Type]``, sorted by relevance, to
	       collect up to ``max_results`` PMIDs of review articles.
	    2. If that yields nothing, ESearch is repeated with the bare query.
	    3. ESummary is called once for all found PMIDs and each record is
	       reduced to a small dictionary.

	    The query is sent as given; no field tag such as ``[Title/Abstract]`` is
	    added here, so callers that want field-restricted matching must include
	    the tag in ``query`` themselves.

	    Args:
	        session (aiohttp.ClientSession): The active HTTP session.
	        query (str): The search term, e.g. "Migraine AND Aura".
	        max_results (int): Maximum number of PMIDs requested from ESearch,
	            and therefore the maximum number of summaries returned.

	    Returns:
	        list[dict]: One dictionary per article, in the order ESearch returned
	        the PMIDs, with the keys ``uid``, ``title``, ``pubdate``, ``authors``,
	        ``journal`` and ``url``. An empty list is returned for an empty or
	        whitespace-only query, when nothing is found, or when an error occurs
	        (exceptions derived from ``Exception`` are printed and swallowed).
	    """
	    # A blank query would only produce a pointless "( ) AND review[...]"
	    # request, so bail out before touching the network.
	    if not query or (isinstance(query, str) and not query.strip()):
	        return []

	    try:
	        # --- Step 1: ESearch - Find relevant article PMIDs ---
	        # Review articles are preferred because they are already synthesized.
	        review_term = f"({query}) AND review[Publication Type]"
	        pmids = await _esearch_pmids(session, review_term, max_results)

	        if not pmids:
	            # If no review articles are found, try a broader search as a fallback
	            print(f"No review articles found for '{query}'. Broadening search...")
	            pmids = await _esearch_pmids(session, query, max_results)

	        if not pmids:
	            print(f"No PubMed results found for query: {query}")
	            return []

	        # --- Step 2: ESummary - Fetch summaries for the found PMIDs ---
	        summaries = await _esummary_results(session, pmids)
	        return _build_article_list(pmids, summaries)

	    except aiohttp.ClientError as e:
	        print(f"An error occurred while fetching from PubMed: {e}")
	        return []
	    except Exception as e:
	        # Deliberate best-effort boundary: timeouts, bad JSON and unexpected
	        # payload shapes all degrade to "no results" instead of crashing.
	        print(f"A general error occurred in the pubmed_client: {e}")
	        return []


	async def _esearch_pmids(session: aiohttp.ClientSession, term: str, max_results: int) -> list[str]:
	    """Run one ESearch request and return the PMIDs from its ``idlist``."""
	    params = {
	        'db': 'pubmed',
	        'term': term,
	        'retmode': 'json',
	        'retmax': max_results,
	        'sort': 'relevance',
	    }
	    url = f"{PUBMED_BASE_URL}/esearch.fcgi"
	    # Every ESearch call (including the fallback) gets the same 10 s budget.
	    timeout = aiohttp.ClientTimeout(total=10)

	    async with session.get(url, params=params, headers=REQUEST_HEADERS, timeout=timeout) as resp:
	        resp.raise_for_status()
	        payload = await resp.json()
	    # The response is released before the caller decides whether to retry.
	    return payload.get('esearchresult', {}).get('idlist', [])


	async def _esummary_results(session: aiohttp.ClientSession, pmids: list[str]) -> dict:
	    """Fetch ESummary data for ``pmids`` and return the ``result`` mapping."""
	    params = {
	        'db': 'pubmed',
	        'id': ",".join(pmids),  # E-utilities can take a comma-separated list of IDs
	        'retmode': 'json',
	    }
	    url = f"{PUBMED_BASE_URL}/esummary.fcgi"
	    timeout = aiohttp.ClientTimeout(total=15)

	    async with session.get(url, params=params, headers=REQUEST_HEADERS, timeout=timeout) as resp:
	        resp.raise_for_status()
	        payload = await resp.json()
	    # 'result' maps each PMID to its summary record.
	    return payload.get('result', {})


	def _build_article_list(pmids: list[str], results: dict) -> list[dict]:
	    """Convert the ESummary ``result`` mapping into a list ordered like ``pmids``."""
	    articles = []
	    for pmid in pmids:
	        if pmid not in results:
	            continue
	        record = results[pmid]
	        # Skip malformed author entries instead of letting one missing
	        # 'name' key discard every article via the caller's broad except.
	        author_names = [
	            author['name']
	            for author in record.get('authors') or []
	            if isinstance(author, dict) and author.get('name')
	        ]
	        articles.append({
	            'uid': record.get('uid', pmid),
	            'title': record.get('title', 'Title Not Available'),
	            'pubdate': record.get('pubdate', 'N/A'),
	            'authors': author_names,
	            'journal': record.get('source', 'N/A'),
	            'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
	        })
	    return articles