import json
import logging
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Dict, Iterator, List

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator

logger = logging.getLogger(__name__)


class PubMedAPIWrapper(BaseModel):
    """
    Wrapper around the PubMed API.

    This wrapper uses the PubMed API to conduct searches and fetch
    document summaries. By default, it returns the document summaries
    of the top-k results of an input search.

    Parameters:
        top_k_results: number of top-scored documents used for the PubMed tool.
        MAX_QUERY_LENGTH: maximum length of the query.
            Default is 300 characters.
        doc_content_chars_max: maximum length of the document content.
            Content will be truncated if it exceeds this length.
            Default is 2000 characters.
        max_retry: maximum number of retries for a request. Default is 5.
        sleep_time: initial time to wait between retries; it is doubled after
            each retry. Default is 0.2 seconds.
        email: email address to be used for the PubMed API.
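
    Example:
        A minimal usage sketch (illustrative; it assumes the optional
        ``xmltodict`` dependency is installed and that the NCBI E-utilities
        endpoints are reachable):

        .. code-block:: python

            pubmed = PubMedAPIWrapper(top_k_results=3)
            print(pubmed.run("chronic kidney disease treatment"))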
""" | |

    parse: Any  #: :meta private:

    base_url_esearch: str = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    )
    base_url_efetch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    max_retry: int = 5
    sleep_time: float = 0.2

    # Default values for the parameters
    top_k_results: int = 3
    MAX_QUERY_LENGTH: int = 300
    doc_content_chars_max: int = 2000
    email: str = "your_email@example.com"

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python package exists in environment."""
        try:
            import xmltodict

            values["parse"] = xmltodict.parse
        except ImportError:
            raise ImportError(
                "Could not import xmltodict python package. "
                "Please install it with `pip install xmltodict`."
            )
        return values

    def run(self, query: str) -> str:
        """
        Run a PubMed search and get the article meta information.
        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        It uses only the most informative fields of the article meta information.
        """
        try:
            # Retrieve the top-k results for the query
            docs = [
                f"Published: {result['Published']}\n"
                f"Title: {result['Title']}\n"
                f"Copyright Information: {result['Copyright Information']}\n"
                f"Summary:\n{result['Summary']}"
                for result in self.load(query[: self.MAX_QUERY_LENGTH])
            ]

            # Join the results and limit the character count
            return (
                "\n\n".join(docs)[: self.doc_content_chars_max]
                if docs
                else "No good PubMed Result was found"
            )
        except Exception as ex:
            return f"PubMed exception: {ex}"

    def lazy_load(self, query: str) -> Iterator[dict]:
        """
        Search PubMed for documents matching the query.
        Return an iterator of dictionaries containing the document metadata.
        """
        url = (
            self.base_url_esearch
            + "db=pubmed&term="
            + urllib.parse.quote(query)
            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
        )
        result = urllib.request.urlopen(url)
        text = result.read().decode("utf-8")
        json_text = json.loads(text)

        # With usehistory=y the history server returns a WebEnv token,
        # which is passed along to efetch when retrieving each article.
        webenv = json_text["esearchresult"]["webenv"]
        for uid in json_text["esearchresult"]["idlist"]:
            yield self.retrieve_article(uid, webenv)

    def load(self, query: str) -> List[dict]:
        """
        Search PubMed for documents matching the query.
        Return a list of dictionaries containing the document metadata.
        """
        return list(self.lazy_load(query))

    def _dict2document(self, doc: dict) -> Document:
        summary = doc.pop("Summary")
        return Document(page_content=summary, metadata=doc)

    def lazy_load_docs(self, query: str) -> Iterator[Document]:
        for d in self.lazy_load(query=query):
            yield self._dict2document(d)

    def load_docs(self, query: str) -> List[Document]:
        return list(self.lazy_load_docs(query=query))

    def retrieve_article(self, uid: str, webenv: str) -> dict:
        url = (
            self.base_url_efetch
            + "db=pubmed&retmode=xml&id="
            + uid
            + "&webenv="
            + webenv
        )

        retry = 0
        while True:
            try:
                result = urllib.request.urlopen(url)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429 and retry < self.max_retry:
                    # Too Many Requests error:
                    # wait for an exponentially increasing amount of time
                    logger.warning(
                        f"Too Many Requests, "
                        f"waiting for {self.sleep_time:.2f} seconds..."
                    )
                    time.sleep(self.sleep_time)
                    self.sleep_time *= 2
                    retry += 1
                else:
                    raise e

        xml_text = result.read().decode("utf-8")
        text_dict = self.parse(xml_text)
        return self._parse_article(uid, text_dict)

    def _parse_article(self, uid: str, text_dict: dict) -> dict:
        try:
            ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
                "Article"
            ]
        except KeyError:
            ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]

        abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
        summaries = [
            f"{txt['@Label']}: {txt['#text']}"
            for txt in abstract_text
            if "#text" in txt and "@Label" in txt
        ]
        summary = (
            "\n".join(summaries)
            if summaries
            else (
                abstract_text
                if isinstance(abstract_text, str)
                else (
                    "\n".join(str(value) for value in abstract_text.values())
                    if isinstance(abstract_text, dict)
                    else "No abstract available"
                )
            )
        )
        a_d = ar.get("ArticleDate", {})
        pub_date = "-".join(
            [a_d.get("Year", ""), a_d.get("Month", ""), a_d.get("Day", "")]
        )

        return {
            "uid": uid,
            "Title": ar.get("ArticleTitle", ""),
            "Published": pub_date,
            "Copyright Information": ar.get("Abstract", {}).get(
                "CopyrightInformation", ""
            ),
            "Summary": summary,
        }
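

if __name__ == "__main__":
    # Minimal usage sketch: it assumes the `xmltodict` package is installed
    # and that outbound requests to the NCBI E-utilities endpoints are
    # permitted. The query string is an arbitrary illustrative example.
    wrapper = PubMedAPIWrapper(top_k_results=2)

    # Plain-text summaries of the top results, truncated to doc_content_chars_max.
    print(wrapper.run("CRISPR gene editing safety"))

    # The same results as Document objects, with the abstract as page_content
    # and the remaining fields (uid, Title, Published, ...) as metadata.
    for doc in wrapper.load_docs("CRISPR gene editing safety"):
        print(doc.metadata["Title"], doc.metadata["Published"])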