lfoppiano committed on
Commit
9c9eab3
·
1 Parent(s): 8893df9

fix dependencies

Browse files
Files changed (1) hide show
  1. grobid_processors.py +19 -4
grobid_processors.py CHANGED
@@ -8,8 +8,6 @@ import grobid_tei_xml
8
  from bs4 import BeautifulSoup
9
  from tqdm import tqdm
10
 
11
- from commons import supermat_tei_parser
12
-
13
 
14
  def get_span_start(type, title=None):
15
  title_ = ' title="' + title + '"' if title is not None else ""
@@ -659,7 +657,7 @@ class XmlProcessor(BaseProcessor):
659
  def parse_xml(self, text):
660
  output_data = OrderedDict()
661
  soup = BeautifulSoup(text, 'xml')
662
- text_blocks_children = supermat_tei_parser.get_children_list(soup, verbose=False)
663
 
664
  passages = []
665
  output_data['passages'] = passages
@@ -680,8 +678,25 @@ class XmlProcessor(BaseProcessor):
680
 
681
  return output_data
682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
- def get_children_list(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
685
  children = []
686
 
687
  child_name = "p" if use_paragraphs else "s"
 
8
  from bs4 import BeautifulSoup
9
  from tqdm import tqdm
10
 
 
 
11
 
12
  def get_span_start(type, title=None):
13
  title_ = ' title="' + title + '"' if title is not None else ""
 
657
  def parse_xml(self, text):
658
  output_data = OrderedDict()
659
  soup = BeautifulSoup(text, 'xml')
660
+ text_blocks_children = get_children_list_supermat(soup, verbose=False)
661
 
662
  passages = []
663
  output_data['passages'] = passages
 
678
 
679
  return output_data
680
 
681
def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
    """Collect the text-bearing element groups of a TEI document.

    Walks the direct children of the ``tei`` root and gathers, in document
    order:

    * from ``teiHeader``: all ``title`` elements (as one group), then one
      group per ``abstract`` element and one per ``ab type="keywords"``
      element;
    * from ``text``: one group per ``body`` element.

    Every group except the title group is the result of
    ``find_all(child_name)`` on the corresponding element, where
    ``child_name`` is ``"p"`` (paragraphs) or ``"s"`` (sentences).

    Args:
        soup: Parsed document exposing a ``tei`` attribute; the caller in
            this file passes ``BeautifulSoup(text, 'xml')``.
        use_paragraphs: When true, collect ``p`` elements; otherwise
            collect ``s`` elements.
        verbose: When true, print the collected groups.

    Returns:
        A list of ``find_all`` results (one entry per group). Empty when the
        document has no ``tei`` root.
    """
    children = []

    # A document without a <tei> root has nothing to collect; returning an
    # empty list avoids an AttributeError on ``None.children``.
    root = getattr(soup, "tei", None)
    if root is None:
        return children

    child_name = "p" if use_paragraphs else "s"
    for child in root.children:
        if child.name == 'teiHeader':
            # Titles are kept together as a single group.
            children.append(child.find_all("title"))
            children.extend(
                subchild.find_all(child_name)
                for subchild in child.find_all("abstract")
            )
            children.extend(
                subchild.find_all(child_name)
                for subchild in child.find_all("ab", {"type": "keywords"})
            )
        elif child.name == 'text':
            children.extend(
                subchild.find_all(child_name)
                for subchild in child.find_all("body")
            )

    if verbose:
        print(str(children))

    return children
698
 
699
+ def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
700
  children = []
701
 
702
  child_name = "p" if use_paragraphs else "s"