Yijun-Yang committed on
Commit
0573e7b
1 Parent(s): 73855f3

findarticles

Browse files
Files changed (1) hide show
  1. huixiangdou/service/findarticles.py +16 -12
huixiangdou/service/findarticles.py CHANGED
@@ -6,6 +6,8 @@ from tqdm import tqdm
6
  import json
7
  import shutil
8
  from loguru import logger
 
 
9
  class ArticleRetrieval:
10
  def __init__(self,
11
  keywords: list,
@@ -32,19 +34,22 @@ class ArticleRetrieval:
32
  return pmc_ids
33
 
34
  # 解析XML文件
35
- def _get_all_text(self,element):
36
- """递归获取XML元素及其所有子元素的文本内容"""
37
- text = element.text or ""
38
- for child in element:
39
- text += self._get_all_text(child)
40
- if child.tail:
41
- text += child.tail
42
- return text
 
 
 
43
 
44
  ## 清洗XML文件
45
  def _clean_xml(self,txt):
46
- logger.error(text[:100])
47
- root = ET.fromstring(txt)
48
  txt = self._get_all_text(root)
49
  txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本
50
  text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
@@ -89,5 +94,4 @@ if __name__ == '__main__':
89
  if os.path.exists('repodir'):
90
  shutil.rmtree('repodir')
91
  articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
92
- pmc_ids = articelfinder.search_pmc()
93
- articelfinder.fetch_full_text(pmc_ids)
 
6
  import json
7
  import shutil
8
  from loguru import logger
9
+ from lxml import etree
10
+
11
  class ArticleRetrieval:
12
  def __init__(self,
13
  keywords: list,
 
34
  return pmc_ids
35
 
36
  # 解析XML文件
37
+ def _get_all_text(self, element):
38
+ """递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
39
+ if element is None:
40
+ return ""
41
+
42
+ text = element.text or ""
43
+ for child in element:
44
+ text += self._get_all_text(child)
45
+ if child is not None and child.tail:
46
+ text += child.tail
47
+ return text
48
 
49
  ## 清洗XML文件
50
  def _clean_xml(self,txt):
51
+ parser = etree.XMLParser(recover=True)
52
+ root = ET.fromstring(txt,parser=parser)
53
  txt = self._get_all_text(root)
54
  txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本
55
  text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
 
94
  if os.path.exists('repodir'):
95
  shutil.rmtree('repodir')
96
  articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
97
+ articelfinder.initiallize()