import json
import logging
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any, Dict, Iterator, List

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator

logger = logging.getLogger(__name__)


class PubMedAPIWrapper(BaseModel):
    """
    Wrapper around the PubMed API.

    This wrapper uses the PubMed API to conduct searches and fetch
    document summaries. By default, it returns the document summaries
    of the top-k results of an input search.

    Parameters:
        top_k_results: number of top-scored documents used for the PubMed tool.
        MAX_QUERY_LENGTH: maximum length of the query.
            Default is 300 characters.
        doc_content_chars_max: maximum length of the document content.
            Content will be truncated if it exceeds this length.
            Default is 2000 characters.
        max_retry: maximum number of retries for a request. Default is 5.
        sleep_time: initial time to wait between retries; it is doubled after
            each retry. Default is 0.2 seconds.
        email: email address to be used for the PubMed API.
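
    Example:
        A minimal usage sketch (illustrative; it assumes the optional
        ``xmltodict`` dependency is installed and that the NCBI E-utilities
        endpoints are reachable):

        .. code-block:: python

            pubmed = PubMedAPIWrapper(top_k_results=3)
            print(pubmed.run("chronic kidney disease treatment"))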
""" | |

    parse: Any  #: :meta private:

    base_url_esearch: str = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    )
    base_url_efetch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    max_retry: int = 5
    sleep_time: float = 0.2

    # Default values for the parameters
    top_k_results: int = 3
    MAX_QUERY_LENGTH: int = 300
    doc_content_chars_max: int = 2000
    email: str = "your_email@example.com"

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python package exists in environment."""
        try:
            import xmltodict

            values["parse"] = xmltodict.parse
        except ImportError:
            raise ImportError(
                "Could not import xmltodict python package. "
                "Please install it with `pip install xmltodict`."
            )
        return values

    def run(self, query: str) -> str:
        """
        Run a PubMed search and get the article meta information.
        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        It uses only the most informative fields of the article meta information.
        """
        try:
            # Retrieve the top-k results for the query
            docs = [
                f"Published: {result['Published']}\n"
                f"Title: {result['Title']}\n"
                f"Copyright Information: {result['Copyright Information']}\n"
                f"Summary:\n{result['Summary']}"
                for result in self.load(query[: self.MAX_QUERY_LENGTH])
            ]

            # Join the results and limit the character count
            return (
                "\n\n".join(docs)[: self.doc_content_chars_max]
                if docs
                else "No good PubMed Result was found"
            )
        except Exception as ex:
            return f"PubMed exception: {ex}"

    def lazy_load(self, query: str) -> Iterator[dict]:
        """
        Search PubMed for documents matching the query.
        Return an iterator of dictionaries containing the document metadata.
        """
        url = (
            self.base_url_esearch
            + "db=pubmed&term="
            + urllib.parse.quote(query)
            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
        )
        result = urllib.request.urlopen(url)
        text = result.read().decode("utf-8")
        json_text = json.loads(text)

        # With usehistory=y the history server returns a WebEnv token,
        # which is passed along to efetch when retrieving each article.
        webenv = json_text["esearchresult"]["webenv"]
        for uid in json_text["esearchresult"]["idlist"]:
            yield self.retrieve_article(uid, webenv)

    def load(self, query: str) -> List[dict]:
        """
        Search PubMed for documents matching the query.
        Return a list of dictionaries containing the document metadata.
        """
        return list(self.lazy_load(query))

    def _dict2document(self, doc: dict) -> Document:
        summary = doc.pop("Summary")
        return Document(page_content=summary, metadata=doc)

    def lazy_load_docs(self, query: str) -> Iterator[Document]:
        for d in self.lazy_load(query=query):
            yield self._dict2document(d)

    def load_docs(self, query: str) -> List[Document]:
        return list(self.lazy_load_docs(query=query))

    def retrieve_article(self, uid: str, webenv: str) -> dict:
        url = (
            self.base_url_efetch
            + "db=pubmed&retmode=xml&id="
            + uid
            + "&webenv="
            + webenv
        )

        retry = 0
        while True:
            try:
                result = urllib.request.urlopen(url)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429 and retry < self.max_retry:
                    # Too Many Requests error:
                    # wait for an exponentially increasing amount of time
                    logger.warning(
                        f"Too Many Requests, "
                        f"waiting for {self.sleep_time:.2f} seconds..."
                    )
                    time.sleep(self.sleep_time)
                    self.sleep_time *= 2
                    retry += 1
                else:
                    raise e

        xml_text = result.read().decode("utf-8")
        text_dict = self.parse(xml_text)
        return self._parse_article(uid, text_dict)

    def _parse_article(self, uid: str, text_dict: dict) -> dict:
        try:
            ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
                "Article"
            ]
        except KeyError:
            ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]

        abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
        summaries = [
            f"{txt['@Label']}: {txt['#text']}"
            for txt in abstract_text
            if "#text" in txt and "@Label" in txt
        ]
        summary = (
            "\n".join(summaries)
            if summaries
            else (
                abstract_text
                if isinstance(abstract_text, str)
                else (
                    "\n".join(str(value) for value in abstract_text.values())
                    if isinstance(abstract_text, dict)
                    else "No abstract available"
                )
            )
        )
        a_d = ar.get("ArticleDate", {})
        pub_date = "-".join(
            [a_d.get("Year", ""), a_d.get("Month", ""), a_d.get("Day", "")]
        )

        return {
            "uid": uid,
            "Title": ar.get("ArticleTitle", ""),
            "Published": pub_date,
            "Copyright Information": ar.get("Abstract", {}).get(
                "CopyrightInformation", ""
            ),
            "Summary": summary,
        }
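

if __name__ == "__main__":
    # Minimal usage sketch: it assumes the `xmltodict` package is installed
    # and that outbound requests to the NCBI E-utilities endpoints are
    # permitted. The query string is an arbitrary illustrative example.
    wrapper = PubMedAPIWrapper(top_k_results=2)

    # Plain-text summaries of the top results, truncated to doc_content_chars_max.
    print(wrapper.run("CRISPR gene editing safety"))

    # The same results as Document objects, with the abstract as page_content
    # and the remaining fields (uid, Title, Published, ...) as metadata.
    for doc in wrapper.load_docs("CRISPR gene editing safety"):
        print(doc.metadata["Title"], doc.metadata["Published"])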