# yyj import requests import xml.etree.ElementTree as ET import os from tqdm import tqdm import json import shutil from loguru import logger from lxml import etree class ArticleRetrieval: def __init__(self, keywords: list, repo_dir = 'repodir', retmax = 500): self.keywords = keywords self.repo_dir = repo_dir self.retmax = retmax ## 通过PMC数据库检索文章 def search_pmc(self): base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" params = { "db": "pmc", "term": '+'.join(self.keywords), "retmax": self.retmax } response = requests.get(base_url, params=params) root = ET.fromstring(response.content) pmc_ids = [id_element.text for id_element in root.findall('.//Id')] print(f"Found {len(pmc_ids)} articles.") self.pmc_ids = pmc_ids return pmc_ids # 解析XML文件 def _get_all_text(self, element): """递归获取XML元素及其所有子元素的文本内容。确保element不为None.""" if element is None: return "" text = element.text or "" for child in element: text += self._get_all_text(child) if child is not None and child.tail: text += child.tail return text ## 清洗XML文件 def _clean_xml(self,txt): parser = etree.XMLParser(recover=True) root = ET.fromstring(txt,parser=parser) txt = self._get_all_text(root) txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本 text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250]) return text ## 通过PMC数据库获取全文 def fetch_full_text(self): if not os.path.exists(self.repo_dir): os.makedirs(self.repo_dir) print(f"Saving articles to {self.repo_dir}.") for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"): base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" params = { "db": "pmc", "id": id, "rettype": "xml", "retmode": "text" } response = requests.get(base_url, params=params) full_text = self._clean_xml(response.text) with open(os.path.join(self.repo_dir,f'PMC{id}.txt'), 'w') as f: f.write(full_text) def save_config(self): config = { 'keywords': self.keywords, 'repo_dir': self.repo_dir, 'pmc_ids': self.pmc_ids, 'len': len(self.pmc_ids), 'retmax': self.retmax } with open(os.path.join(self.repo_dir, 'config.json'), 'w') as f: json.dump(config, f, indent=4, ensure_ascii=False) def initiallize(self): self.search_pmc() self.fetch_full_text() self.save_config() if __name__ == '__main__': if os.path.exists('repodir'): shutil.rmtree('repodir') articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5) articelfinder.initiallize()