Spaces:

Yijun-Yang
/

ReadReview

Sleeping

Yijun-Yang committed on Jun 5

Commit

0573e7b

•

1 Parent(s): 73855f3

findarticles

Files changed (1) hide show

huixiangdou/service/findarticles.py CHANGED Viewed

@@ -6,6 +6,8 @@ from tqdm import tqdm
 import json
 import shutil
 from loguru import logger
 class ArticleRetrieval:
     def __init__(self,
                     keywords: list,
@@ -32,19 +34,22 @@ class ArticleRetrieval:
         return pmc_ids
     # 解析XML文件
-    def _get_all_text(self,element):
-        """递归获取XML元素及其所有子元素的文本内容"""
-        text = element.text or ""
-        for child in element:
-            text += self._get_all_text(child)
-            if child.tail:
-                text += child.tail
-        return text
     ## 清洗XML文件
     def _clean_xml(self,txt):
-        logger.error(text[:100])
-        root = ET.fromstring(txt)
         txt = self._get_all_text(root)
         txt = txt.split('REFERENCES')[0]  # 截取参考文献之前的文本
         text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
@@ -89,5 +94,4 @@ if __name__ == '__main__':
     if os.path.exists('repodir'):
         shutil.rmtree('repodir')
     articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
-    pmc_ids = articelfinder.search_pmc()
-    articelfinder.fetch_full_text(pmc_ids)

 import json
 import shutil
 from loguru import logger
+from lxml import etree
 class ArticleRetrieval:
     def __init__(self,
                     keywords: list,
         return pmc_ids
     # 解析XML文件
+def _get_all_text(self, element):
+    """递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
+    if element is None:
+        return ""
+    text = element.text or ""
+    for child in element:
+        text += self._get_all_text(child)
+        if child is not None and child.tail:
+            text += child.tail
+    return text
     ## 清洗XML文件
     def _clean_xml(self,txt):
+        parser = etree.XMLParser(recover=True)
+        root = ET.fromstring(txt,parser=parser)
         txt = self._get_all_text(root)
         txt = txt.split('REFERENCES')[0]  # 截取参考文献之前的文本
         text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
     if os.path.exists('repodir'):
         shutil.rmtree('repodir')
     articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
+    articelfinder.initiallize()