File size: 3,237 Bytes
7a919c0
 
 
 
 
 
 
73855f3
0573e7b
 
7a919c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0573e7b
 
 
 
 
 
 
 
 
 
 
7a919c0
 
 
0573e7b
 
7a919c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0573e7b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# yyj
import requests
import xml.etree.ElementTree as ET
import os 
from tqdm import tqdm
import json
import shutil
from loguru import logger
from lxml import etree

class ArticleRetrieval:
    """Retrieve articles from PubMed Central (PMC).

    Searches PMC via the NCBI E-utilities for the given keywords,
    downloads the full-text XML of each hit, strips it down to plain
    text, and writes one ``PMC<id>.txt`` file per article under
    ``repo_dir``, plus a ``config.json`` describing the run.
    """

    def __init__(self,
                 keywords: list,
                 repo_dir='repodir',
                 retmax=500):
        self.keywords = keywords    # search terms; joined with '+' in the query
        self.repo_dir = repo_dir    # output directory for article text files
        self.retmax = retmax        # maximum number of search hits to retrieve

    ## Search for articles via the PMC database
    def search_pmc(self):
        """Query NCBI esearch for ``self.keywords``.

        Stores the matching PMC id strings on ``self.pmc_ids`` and
        returns them.
        """
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {
            "db": "pmc",
            "term": '+'.join(self.keywords),
            "retmax": self.retmax
        }
        response = requests.get(base_url, params=params)
        root = ET.fromstring(response.content)
        pmc_ids = [id_element.text for id_element in root.findall('.//Id')]
        print(f"Found {len(pmc_ids)} articles.")
        self.pmc_ids = pmc_ids
        return pmc_ids

    # Parse the XML tree.
    # BUG FIX: in the original file this method was accidentally dedented
    # to module level, which terminated the class body (making the methods
    # below an IndentationError) and left ``self._get_all_text`` unresolved.
    def _get_all_text(self, element):
        """Recursively collect the text of an XML element and all of its
        descendants (including tail text). Returns "" when ``element``
        is None."""
        if element is None:
            return ""

        text = element.text or ""
        for child in element:
            text += self._get_all_text(child)
            if child is not None and child.tail:
                text += child.tail
        return text

    ## Clean the raw XML payload into readable paragraphs
    def _clean_xml(self, txt):
        """Flatten article XML to plain text and keep only long lines.

        Uses a recovering lxml parser because efetch sometimes returns
        malformed XML.
        """
        parser = etree.XMLParser(recover=True)
        root = ET.fromstring(txt, parser=parser)
        txt = self._get_all_text(root)
        txt = txt.split('REFERENCES')[0]  # drop everything after the reference list
        # Keep only substantial lines (>250 chars) to filter headings/labels.
        text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip()) > 250])
        return text

    ## Fetch the full texts from the PMC database
    def fetch_full_text(self):
        """Download and clean the full text of every id in ``self.pmc_ids``.

        Requires ``search_pmc()`` to have been called first. Writes one
        ``PMC<id>.txt`` file per article into ``self.repo_dir``.
        """
        if not os.path.exists(self.repo_dir):
            os.makedirs(self.repo_dir)
        print(f"Saving articles to {self.repo_dir}.")

        # Renamed loop variable: the original shadowed the builtin `id`.
        for pmc_id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
            params = {
                "db": "pmc",
                "id": pmc_id,
                "rettype": "xml",
                "retmode": "text"
            }
            response = requests.get(base_url, params=params)
            full_text = self._clean_xml(response.text)
            # Explicit utf-8 so article text survives on platforms whose
            # default locale encoding cannot represent it.
            with open(os.path.join(self.repo_dir, f'PMC{pmc_id}.txt'), 'w', encoding='utf-8') as f:
                f.write(full_text)

    def save_config(self):
        """Persist the retrieval parameters and fetched ids to config.json."""
        config = {
            'keywords': self.keywords,
            'repo_dir': self.repo_dir,
            'pmc_ids': self.pmc_ids,
            'len': len(self.pmc_ids),
            'retmax': self.retmax
        }
        # utf-8 is required because ensure_ascii=False may emit non-ASCII.
        with open(os.path.join(self.repo_dir, 'config.json'), 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=4, ensure_ascii=False)

    def initialize(self):
        """Run the full pipeline: search, fetch full texts, save config."""
        self.search_pmc()
        self.fetch_full_text()
        self.save_config()

    # Backward-compatible alias for the original misspelled method name.
    initiallize = initialize

if __name__ == '__main__':
    # Start each run from a clean output directory.
    if os.path.exists('repodir'):
        shutil.rmtree('repodir')
    retriever = ArticleRetrieval(
        keywords=['covid-19'],
        repo_dir='repodir',
        retmax=5,
    )
    retriever.initiallize()