# huixiangdou/service/findarticles.py
# yyj
import requests
import xml.etree.ElementTree as ET
import os
from tqdm import tqdm
import json
import shutil
from loguru import logger
from lxml import etree


class ArticleRetrieval:
    """Search PubMed Central (PMC) for keyword matches and save cleaned full texts locally."""

    def __init__(self,
                 keywords: list,
                 repo_dir='repodir',
                 retmax=500):
        self.keywords = keywords
        self.repo_dir = repo_dir
        self.retmax = retmax
        self.pmc_ids = []

    ## Search the PMC database for articles matching the keywords.
    def search_pmc(self):
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": '+'.join(self.keywords),
            "retmax": self.retmax
        }
        response = requests.get(base_url, params=params)
        root = ET.fromstring(response.content)
        # ESearch returns an <IdList> whose <Id> children are PMC identifiers.
        pmc_ids = [id_element.text for id_element in root.findall('.//Id')]
        print(f"Found {len(pmc_ids)} articles.")
        self.pmc_ids = pmc_ids
        return pmc_ids
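
    # For reference, the ESearch XML response has roughly this shape
    # (element names per the NCBI E-utilities docs; the values here are made up):
    #
    #   <eSearchResult>
    #     <Count>12345</Count>
    #     <IdList>
    #       <Id>9876543</Id>
    #       <Id>9876542</Id>
    #     </IdList>
    #   </eSearchResult>
    #
    # so root.findall('.//Id') yields one element per matched article.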

    # Parse XML content into plain text.
    def _get_all_text(self, element):
        """Recursively collect the text of an XML element and all of its children.
        Returns an empty string if element is None."""
        if element is None:
            return ""
        text = element.text or ""
        for child in element:
            text += self._get_all_text(child)
            if child.tail:
                text += child.tail
        return text
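
    # A small, hypothetical illustration of what _get_all_text recovers,
    # including the tail text that follows inline child elements:
    #   <sec><title>Intro</title><p>Some <italic>text</italic>.</p></sec>
    #   -> "IntroSome text."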

    ## Clean the fetched XML into plain text.
    def _clean_xml(self, txt):
        # lxml's recovering parser tolerates malformed PMC XML; ET.fromstring
        # drives it through the feed()/close() parser interface.
        parser = etree.XMLParser(recover=True)
        root = ET.fromstring(txt, parser=parser)
        txt = self._get_all_text(root)
        txt = txt.split('REFERENCES')[0]  # keep only the text before the reference list
        # Keep only substantial lines (> 250 characters) to drop titles, labels and captions.
        text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip()) > 250])
        return text

    ## Fetch full texts from the PMC database.
    def fetch_full_text(self):
        if not os.path.exists(self.repo_dir):
            os.makedirs(self.repo_dir)
        print(f"Saving articles to {self.repo_dir}.")
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
            params = {
                "db": "pmc",
                "id": id,
                "rettype": "xml",
                "retmode": "text"
            }
            response = requests.get(base_url, params=params)
            full_text = self._clean_xml(response.text)
            with open(os.path.join(self.repo_dir, f'PMC{id}.txt'), 'w', encoding='utf-8') as f:
                f.write(full_text)
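
    # Each EFetch request above expands to a URL of this shape (id value hypothetical):
    #   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=9876543&rettype=xml&retmode=text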

    # Record the search settings and retrieved ids next to the downloaded articles.
    def save_config(self):
        config = {
            'keywords': self.keywords,
            'repo_dir': self.repo_dir,
            'pmc_ids': self.pmc_ids,
            'len': len(self.pmc_ids),
            'retmax': self.retmax
        }
        with open(os.path.join(self.repo_dir, 'config.json'), 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=4, ensure_ascii=False)

    # Run the full pipeline: search, download, then save the config.
    def initiallize(self):
        self.search_pmc()
        self.fetch_full_text()
        self.save_config()


if __name__ == '__main__':
    # Start from a clean repository directory on every run.
    if os.path.exists('repodir'):
        shutil.rmtree('repodir')
    article_finder = ArticleRetrieval(keywords=['covid-19'], repo_dir='repodir', retmax=5)
    article_finder.initiallize()
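
# A larger run only needs different constructor arguments, e.g. (hypothetical keywords):
#
#     finder = ArticleRetrieval(keywords=['lung', 'fibrosis'],
#                               repo_dir='repodir_fibrosis',
#                               retmax=200)
#     finder.initiallize()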