File size: 1,552 Bytes
93bc8ec
 
5fd26bb
55de44e
93bc8ec
5fd26bb
 
55de44e
93bc8ec
5fd26bb
 
55de44e
5fd26bb
55de44e
5fd26bb
 
55de44e
93bc8ec
5fd26bb
 
55de44e
5fd26bb
 
55de44e
 
 
93bc8ec
55de44e
 
 
 
 
 
 
 
93bc8ec
55de44e
 
 
 
93404c2
55de44e
93bc8ec
55de44e
93bc8ec
55de44e
 
 
 
93404c2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os

from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
from tests.resources import TEST_DATA_PATH


def test_get_xml_nodes_body_paragraphs():
    """Check the node count of ``get_xml_nodes_body`` with ``use_paragraphs=True``.

    Parses the ``2312.07559.paragraphs.tei.xml`` fixture and expects the call
    to return exactly 70 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")
    # Pin the encoding: without it, open() decodes with the platform locale
    # default, which makes the test environment-dependent.
    # NOTE(review): assumes the fixture is UTF-8 encoded - confirm.
    with open(fixture_path, encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    nodes = get_xml_nodes_body(soup, use_paragraphs=True)

    assert len(nodes) == 70


def test_get_xml_nodes_body_sentences():
    """Check the node count of ``get_xml_nodes_body`` with ``use_paragraphs=False``.

    Parses the ``2312.07559.sentences.tei.xml`` fixture and expects the call
    to return exactly 327 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.sentences.tei.xml")
    # Pin the encoding: without it, open() decodes with the platform locale
    # default, which makes the test environment-dependent.
    # NOTE(review): assumes the fixture is UTF-8 encoded - confirm.
    with open(fixture_path, encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_body(soup, use_paragraphs=False)

    assert len(children) == 327


def test_get_xml_nodes_figures():
    """Check the node count returned by ``get_xml_nodes_figures``.

    Parses the ``2312.07559.paragraphs.tei.xml`` fixture and expects the call
    to return exactly 13 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")
    # Pin the encoding: without it, open() decodes with the platform locale
    # default, which makes the test environment-dependent.
    # NOTE(review): assumes the fixture is UTF-8 encoded - confirm.
    with open(fixture_path, encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_figures(soup)

    assert len(children) == 13


def test_get_xml_nodes_header_paragraphs():
    """Check the total node count of ``get_xml_nodes_header`` with default arguments.

    Parses the ``2312.07559.paragraphs.tei.xml`` fixture; the result is
    treated as a mapping and the lengths of all its values must sum to 8.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")
    # Pin the encoding: without it, open() decodes with the platform locale
    # default, which makes the test environment-dependent.
    # NOTE(review): assumes the fixture is UTF-8 encoded - confirm.
    with open(fixture_path, encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_header(soup)

    # Only the values matter for the total; the keys are not inspected.
    assert sum(len(child) for child in children.values()) == 8


def test_get_xml_nodes_header_sentences():
    """Check the total node count of ``get_xml_nodes_header`` with ``use_paragraphs=False``.

    Parses the ``2312.07559.sentences.tei.xml`` fixture; the result is
    treated as a mapping and the lengths of all its values must sum to 15.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.sentences.tei.xml")
    # Pin the encoding: without it, open() decodes with the platform locale
    # default, which makes the test environment-dependent.
    # NOTE(review): assumes the fixture is UTF-8 encoded - confirm.
    with open(fixture_path, encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_header(soup, use_paragraphs=False)

    # Only the values matter for the total; the keys are not inspected.
    assert sum(len(child) for child in children.values()) == 15