# huixiangdou/service/findarticles.py
# yyj
import json
import os
import shutil
import xml.etree.ElementTree as ET

import requests
from bs4 import BeautifulSoup
from loguru import logger
from lxml import etree
from tqdm import tqdm

def download_pdfs(path, doi_list):  # contributed by fox: https://github.com/BigWhiteFox
    """Try to fetch PDFs for the given DOI(s) from Sci-Hub into `path`.

    Returns True if at least one PDF was downloaded successfully.
    """
    # Ensure the download directory exists
    if not os.path.exists(path):
        os.makedirs(path)
    # Accept a single DOI string as well as a list of DOIs
    if isinstance(doi_list, str):
        doi_list = [doi_list]
    href_list = []
    success = False
    for doi in doi_list:
        url = f"https://sci-hub.se/{doi}"
        response = requests.get(url)
        # Check whether the request succeeded
        if response.status_code == 200:
            print(f"Request succeeded: {url}")
        else:
            print(f"Request failed: {url}, status code: {response.status_code}")
            continue  # Skip this DOI if the request failed
soup = BeautifulSoup(response.text, 'html.parser')
buttons = soup.find_all('button', onclick=True)
for button in buttons:
onclick = button.get('onclick')
if onclick:
pdf_url = onclick.split("'")[1]
href_list.append((pdf_url, doi))
print("pdf_url:", pdf_url)
print("href_list:", href_list)
    # Iterate over every (URL, DOI) pair collected above
    for href, doi in href_list:
        pdf_url = f"https:{href}"
        try:
            response = requests.get(pdf_url, stream=True)
            if response.status_code == 200:
                filename = doi.replace("/", "_") + ".pdf"
                file_path = os.path.join(path, filename)
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f"File downloaded and saved as: {file_path}")
                success = True
            else:
                print(f"Download failed, status code: {response.status_code}, URL: {pdf_url}")
        except requests.RequestException as e:
            print(f"Failed to download due to an exception: {e}")
    return success
class ArticleRetrieval:
    def __init__(self,
                 keywords: list = None,
                 pmids: list = None,
                 repo_dir='repodir',
                 retmax=500):
        # Default to empty lists here rather than in the signature
        # (mutable default arguments are shared across calls)
        keywords = keywords if keywords is not None else []
        pmids = pmids if pmids is not None else []
        if not keywords and not pmids:
            raise ValueError("Either keywords or pmids must be provided.")
        self.keywords = keywords
        self.pmids = pmids
        self.repo_dir = repo_dir
        self.retmax = retmax
        self.pmc_ids = []
def esummary_pmc(self):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
params = {
"db": "pubmed",
"id": ','.join(self.pmids),
# "retmax": self.retmax
}
response = requests.get(base_url, params=params)
root = ET.fromstring(response.content)
results = []
for docsum in root.findall('DocSum'):
pmcid = None
doi = None
id_value = docsum.find('Id').text
for item in docsum.findall('.//Item[@Name="doi"]'):
doi = item.text
break
for item in docsum.findall('.//Item[@Name="pmc"]'):
pmcid = item.text
break
results.append((id_value, pmcid, doi))
logger.info(f"total {len(results)} articles:")
logger.info(f"found {len([r for r in results if r[1] is not None])} articles with PMC ID.")
logger.info(f"found {len([r for r in results if r[2] is not None])} articles with DOI.")
logger.info(f"found {len([r for r in results if r[1] is None and r[2] is None])} articles without PMC ID and DOI.")
self.esummary = results
self.pmc_ids = [r[1] for r in results if r[1] is not None]
self.scihub_doi = [r[2] for r in results if r[1] is None and r[2] is not None]
self.failed_pmids = [r[0] for r in results if r[1] is None and r[2] is None]
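
    # Illustrative shape of the ESummary XML parsed above (abridged; real
    # responses carry many more Item fields, and the IDs here are placeholders):
    #   <eSummaryResult>
    #     <DocSum>
    #       <Id>34536239</Id>
    #       <Item Name="ArticleIds" Type="List">
    #         <Item Name="pmc" Type="String">PMC1234567</Item>
    #         <Item Name="doi" Type="String">10.1000/example</Item>
    #       </Item>
    #     </DocSum>
    #   </eSummaryResult>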
    ## Search PubMed for article IDs matching the keywords
def esearch_pmc(self):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pubmed",
            # Join keywords with spaces; requests URL-encodes the query, and
            # E-utilities ANDs whitespace-separated terms together by default
            "term": ' '.join(self.keywords),
            "retmax": self.retmax
        }
response = requests.get(base_url, params=params)
root = ET.fromstring(response.content)
        idlist = root.find('.//IdList')
        # find() returns None when the response contains no IdList element
        if idlist is not None:
            pmids = [id_element.text for id_element in idlist.findall('.//Id')]
        else:
            pmids = []
print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
self.search_pmid = pmids
self.pmids.extend(pmids)
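
    # Illustrative effective request for keywords ['cancer', 'immunotherapy']
    # (requests encodes the space between terms as '+'):
    #   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+immunotherapy&retmax=500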
    # Parse the XML file
    def _get_all_text(self, element):
        """Recursively collect the text content of an XML element and all of
        its children. Returns an empty string when `element` is None."""
if element is None:
return ""
text = element.text or ""
for child in element:
text += self._get_all_text(child)
if child is not None and child.tail:
text += child.tail
return text
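
    # Example: for ET.fromstring('<p>Hi <b>there</b>!</p>') the method above
    # returns 'Hi there!': element text, child text, then the child's tail.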
    ## Clean the fetched XML into plain text
    def _clean_xml(self, txt):
        # A recovering lxml parser tolerates the occasionally malformed XML PMC returns
        parser = etree.XMLParser(recover=True)
        root = ET.fromstring(txt, parser=parser)
        txt = self._get_all_text(root)
        txt = txt.split('REFERENCES')[0]  # keep only the text before the references section
        # Drop short fragments (labels, captions); keep paragraphs longer than 250 characters
        text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip()) > 250])
        return text
    ## Fetch full texts from PMC, falling back to Sci-Hub for DOI-only records
def fetch_full_text(self):
if not os.path.exists(self.repo_dir):
os.makedirs(self.repo_dir)
print(f"Saving articles to {self.repo_dir}.")
self.pmc_success = 0
self.scihub_success = 0
self.failed_download = []
downloaded = os.listdir(self.repo_dir)
        for pmc_id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
            # Skip files already downloaded in a previous run
            if f"{pmc_id}.txt" in downloaded:
                print(f"File already downloaded: {pmc_id}")
                self.pmc_success += 1
                continue
            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
            params = {
                "db": "pmc",
                "id": pmc_id,
                "rettype": "xml",
                "retmode": "text"
            }
            response = requests.get(base_url, params=params)
            full_text = self._clean_xml(response.text)
            if full_text.strip() == '':
                self.failed_download.append(pmc_id)
                continue
            logger.info(full_text[:200])
            with open(os.path.join(self.repo_dir, f'{pmc_id}.txt'), 'w', encoding='utf-8') as f:
                f.write(full_text)
            self.pmc_success += 1
for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
# check if file already downloaded
if f"{doi.replace('/','_')}.pdf" in downloaded:
print(f"File already downloaded: {doi}")
self.scihub_success += 1
continue
            # download_pdfs returns True when at least one PDF was saved
            if download_pdfs(path=self.repo_dir, doi_list=doi):
self.scihub_success += 1
else:
self.failed_download.append(doi)
def save_config(self):
config = {
'repo_dir': self.repo_dir,
'keywords': self.keywords,
'retmax': self.retmax,
"search_pmids": self.search_pmid,
'import_pmids': [id for id in self.pmids if id not in self.search_pmid],
'failed_pmids': self.failed_pmids,
'result': [
{
'pmid': r[0],
'pmcid': r[1],
'doi': r[2]
} for r in self.esummary
],
"pmc_success_d": self.pmc_success,
"scihub_success_d": self.scihub_success,
"failed_download": self.failed_download,
}
with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
json.dump(config, f, indent=4, ensure_ascii=False)
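
    # Illustrative shape of the resulting info.json (all values are placeholders):
    #   {
    #       "repo_dir": "repodir",
    #       "keywords": [],
    #       "retmax": 5,
    #       "search_pmids": [],
    #       "import_pmids": ["34536239"],
    #       "failed_pmids": [],
    #       "result": [{"pmid": "34536239", "pmcid": "PMC1234567", "doi": "10.1000/example"}],
    #       "pmc_success_d": 1,
    #       "scihub_success_d": 0,
    #       "failed_download": []
    #   }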
    def initialize(self):
        if self.keywords != []:
            print(self.keywords)
            self.esearch_pmc()  # get pmids from PubMed using the keywords
        self.esummary_pmc()  # map pmids to PMC IDs and DOIs
        self.fetch_full_text()  # fetch full texts from PMC (or Sci-Hub via DOI)
        self.save_config()  # save the run configuration and results
if __name__ == '__main__':
    if os.path.exists('repodir'):
        shutil.rmtree('repodir')
    strings = """
    34536239
    7760895
    36109602
    24766875"""
    lines = [k.strip() for k in strings.split('\n')]
    pmids = [k for k in lines if k.isdigit()]
    print(pmids)
    keys = [k for k in lines if not k.isdigit() and k != '']
    print(keys)
    article_finder = ArticleRetrieval(keywords=keys, pmids=pmids,
                                      repo_dir='repodir', retmax=5)
    article_finder.initialize()