# huixiangdou/service/findarticles.py
# yyj
import json
import os
import shutil
import xml.etree.ElementTree as ET

import requests
from bs4 import BeautifulSoup
from loguru import logger
from lxml import etree
from tqdm import tqdm

def download_pdfs(path, doi_list):  # contributed by fox: https://github.com/BigWhiteFox
    """Try to fetch PDFs for the given DOI(s) from Sci-Hub into `path`.

    Returns True if at least one PDF was downloaded successfully.
    """
    # Ensure the download directory exists
    if not os.path.exists(path):
        os.makedirs(path)
    # Accept a single DOI string as well as a list of DOIs
    if isinstance(doi_list, str):
        doi_list = [doi_list]
    href_list = []
    success = False
    for doi in doi_list:
        url = f"https://sci-hub.se/{doi}"
        response = requests.get(url)
        # Check whether the request succeeded
        if response.status_code == 200:
            print(f"Request succeeded: {url}")
        else:
            print(f"Request failed: {url}, status code: {response.status_code}")
            continue  # Skip this DOI if the request failed
soup = BeautifulSoup(response.text, 'html.parser')
buttons = soup.find_all('button', onclick=True)
for button in buttons:
onclick = button.get('onclick')
if onclick:
pdf_url = onclick.split("'")[1]
href_list.append((pdf_url, doi))
print("pdf_url:", pdf_url)
print("href_list:", href_list)
    # Iterate over every (URL, DOI) pair collected above
    for href, doi in href_list:
        pdf_url = f"https:{href}"
        try:
            response = requests.get(pdf_url, stream=True)
            if response.status_code == 200:
                filename = doi.replace("/", "_") + ".pdf"
                file_path = os.path.join(path, filename)
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f"File downloaded and saved as: {file_path}")
                success = True
            else:
                print(f"Download failed, status code: {response.status_code}, URL: {pdf_url}")
        except requests.RequestException as e:
            print(f"Failed to download due to an exception: {e}")
    return success
class ArticleRetrieval:
    def __init__(self,
                 keywords: list = None,
                 pmids: list = None,
                 repo_dir='repodir',
                 retmax=500):
        # Default to empty lists here rather than in the signature
        # (mutable default arguments are shared across calls)
        keywords = keywords if keywords is not None else []
        pmids = pmids if pmids is not None else []
        if not keywords and not pmids:
            raise ValueError("Either keywords or pmids must be provided.")
        self.keywords = keywords
        self.pmids = pmids
        self.repo_dir = repo_dir
        self.retmax = retmax
        self.pmc_ids = []
def esummary_pmc(self):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
params = {
"db": "pubmed",
"id": ','.join(self.pmids),
# "retmax": self.retmax
}
response = requests.get(base_url, params=params)
root = ET.fromstring(response.content)
results = []
for docsum in root.findall('DocSum'):
pmcid = None
doi = None
id_value = docsum.find('Id').text
for item in docsum.findall('.//Item[@Name="doi"]'):
doi = item.text
break
for item in docsum.findall('.//Item[@Name="pmc"]'):
pmcid = item.text
break
results.append((id_value, pmcid, doi))
logger.info(f"total {len(results)} articles:")
logger.info(f"found {len([r for r in results if r[1] is not None])} articles with PMC ID.")
logger.info(f"found {len([r for r in results if r[2] is not None])} articles with DOI.")
logger.info(f"found {len([r for r in results if r[1] is None and r[2] is None])} articles without PMC ID and DOI.")
self.esummary = results
self.pmc_ids = [r[1] for r in results if r[1] is not None]
self.scihub_doi = [r[2] for r in results if r[1] is None and r[2] is not None]
self.failed_pmids = [r[0] for r in results if r[1] is None and r[2] is None]
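
    # Illustrative shape of the ESummary XML parsed above (abridged; real
    # responses carry many more Item fields, and the IDs here are placeholders):
    #   <eSummaryResult>
    #     <DocSum>
    #       <Id>34536239</Id>
    #       <Item Name="ArticleIds" Type="List">
    #         <Item Name="pmc" Type="String">PMC1234567</Item>
    #         <Item Name="doi" Type="String">10.1000/example</Item>
    #       </Item>
    #     </DocSum>
    #   </eSummaryResult>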
    ## Search PubMed for article IDs matching the keywords
def esearch_pmc(self):
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pubmed",
            # Join keywords with spaces; requests URL-encodes the query, and
            # E-utilities ANDs whitespace-separated terms together by default
            "term": ' '.join(self.keywords),
            "retmax": self.retmax
        }
response = requests.get(base_url, params=params)
root = ET.fromstring(response.content)
        idlist = root.find('.//IdList')
        # find() returns None when the response contains no IdList element
        if idlist is not None:
            pmids = [id_element.text for id_element in idlist.findall('.//Id')]
        else:
            pmids = []
print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
self.search_pmid = pmids
self.pmids.extend(pmids)
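
    # Illustrative effective request for keywords ['cancer', 'immunotherapy']
    # (requests encodes the space between terms as '+'):
    #   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+immunotherapy&retmax=500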
    # Parse the XML file
    def _get_all_text(self, element):
        """Recursively collect the text content of an XML element and all of
        its children. Returns an empty string when `element` is None."""
if element is None:
return ""
text = element.text or ""
for child in element:
text += self._get_all_text(child)
if child is not None and child.tail:
text += child.tail
return text
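
    # Example: for ET.fromstring('<p>Hi <b>there</b>!</p>') the method above
    # returns 'Hi there!': element text, child text, then the child's tail.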
    ## Clean the fetched XML into plain text
    def _clean_xml(self, txt):
        # A recovering lxml parser tolerates the occasionally malformed XML PMC returns
        parser = etree.XMLParser(recover=True)
        root = ET.fromstring(txt, parser=parser)
        txt = self._get_all_text(root)
        txt = txt.split('REFERENCES')[0]  # keep only the text before the references section
        # Drop short fragments (labels, captions); keep paragraphs longer than 250 characters
        text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip()) > 250])
        return text
    ## Fetch full texts from PMC, falling back to Sci-Hub for DOI-only records
def fetch_full_text(self):
if not os.path.exists(self.repo_dir):
os.makedirs(self.repo_dir)
print(f"Saving articles to {self.repo_dir}.")
self.pmc_success = 0
self.scihub_success = 0
self.failed_download = []
downloaded = os.listdir(self.repo_dir)
        for pmc_id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
            # Skip files already downloaded in a previous run
            if f"{pmc_id}.txt" in downloaded:
                print(f"File already downloaded: {pmc_id}")
                self.pmc_success += 1
                continue
            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
            params = {
                "db": "pmc",
                "id": pmc_id,
                "rettype": "xml",
                "retmode": "text"
            }
            response = requests.get(base_url, params=params)
            full_text = self._clean_xml(response.text)
            if full_text.strip() == '':
                self.failed_download.append(pmc_id)
                continue
            logger.info(full_text[:200])
            with open(os.path.join(self.repo_dir, f'{pmc_id}.txt'), 'w', encoding='utf-8') as f:
                f.write(full_text)
            self.pmc_success += 1
for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
# check if file already downloaded
if f"{doi.replace('/','_')}.pdf" in downloaded:
print(f"File already downloaded: {doi}")
self.scihub_success += 1
continue
            # download_pdfs returns True when at least one PDF was saved
            if download_pdfs(path=self.repo_dir, doi_list=doi):
self.scihub_success += 1
else:
self.failed_download.append(doi)
def save_config(self):
config = {
'repo_dir': self.repo_dir,
'keywords': self.keywords,
'retmax': self.retmax,
"search_pmids": self.search_pmid,
'import_pmids': [id for id in self.pmids if id not in self.search_pmid],
'failed_pmids': self.failed_pmids,
'result': [
{
'pmid': r[0],
'pmcid': r[1],
'doi': r[2]
} for r in self.esummary
],
"pmc_success_d": self.pmc_success,
"scihub_success_d": self.scihub_success,
"failed_download": self.failed_download,
}
with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
json.dump(config, f, indent=4, ensure_ascii=False)
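
    # Illustrative shape of the resulting info.json (all values are placeholders):
    #   {
    #       "repo_dir": "repodir",
    #       "keywords": [],
    #       "retmax": 5,
    #       "search_pmids": [],
    #       "import_pmids": ["34536239"],
    #       "failed_pmids": [],
    #       "result": [{"pmid": "34536239", "pmcid": "PMC1234567", "doi": "10.1000/example"}],
    #       "pmc_success_d": 1,
    #       "scihub_success_d": 0,
    #       "failed_download": []
    #   }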
    def initialize(self):
        if self.keywords != []:
            print(self.keywords)
            self.esearch_pmc()  # get pmids from PubMed using the keywords
        self.esummary_pmc()  # map pmids to PMC IDs and DOIs
        self.fetch_full_text()  # fetch full texts from PMC (or Sci-Hub via DOI)
        self.save_config()  # save the run configuration and results
if __name__ == '__main__':
    if os.path.exists('repodir'):
        shutil.rmtree('repodir')
    strings = """
    34536239
    7760895
    36109602
    24766875"""
    lines = [k.strip() for k in strings.split('\n')]
    pmids = [k for k in lines if k.isdigit()]
    print(pmids)
    keys = [k for k in lines if not k.isdigit() and k != '']
    print(keys)
    article_finder = ArticleRetrieval(keywords=keys, pmids=pmids,
                                      repo_dir='repodir', retmax=5)
    article_finder.initialize()