File size: 816 Bytes
5ebeb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417b347
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import xml.etree.ElementTree as ET


class XmlParser:
    """Extract the transcribed text of a PAGE XML file into a plain-text file.

    The document is parsed once on construction; ``xml_to_txt`` then walks
    its ``TextRegion`` / ``TextLine`` elements and writes their text out.
    """

    # Namespace prefix in ElementTree's ``{uri}`` form, prepended to every
    # tag name that is looked up in the document.
    DEFAULT_NAMESPACE = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

    def __init__(self, page_xml="./page_xml.xml", namespace: str = DEFAULT_NAMESPACE) -> None:
        """Parse ``page_xml`` and remember the namespace used for lookups.

        Args:
            page_xml: File name or file object handed to ``ET.parse``.
            namespace: Tag prefix in ``{uri}`` form. Defaults to the
                2013-07-15 PAGE content namespace; pass another value for
                documents that declare a different namespace.

        Raises:
            OSError: If ``page_xml`` cannot be opened.
            xml.etree.ElementTree.ParseError: If the file is not well-formed.
        """
        self.tree = ET.parse(page_xml, parser=ET.XMLParser(encoding="utf-8"))
        self.root = self.tree.getroot()
        self.namespace = namespace

    def xml_to_txt(self, output_file: str = "page_txt.txt") -> None:
        """Write the text of every text line to ``output_file`` (UTF-8).

        Each ``TextLine`` with a non-empty ``TextEquiv/Unicode`` child becomes
        one output line, and a blank line is written after every
        ``TextRegion``. Text lines without a transcription (missing
        ``TextEquiv``, missing ``Unicode``, or empty ``Unicode``) are skipped
        instead of raising.

        Args:
            output_file: Path of the text file to create or overwrite.
        """
        ns = self.namespace
        # Loop-invariant lookup paths, built once.
        region_path = f".//{ns}TextRegion"
        line_path = f".//{ns}TextLine"
        unicode_path = f"{ns}TextEquiv/{ns}Unicode"

        with open(output_file, "w", encoding="utf-8") as f:
            # NOTE: both searches use ".//" (all descendants), so the lines of
            # a region nested inside another region are written once for each
            # enclosing region.
            for textregion in self.root.findall(region_path):
                for textline in textregion.findall(line_path):
                    # findtext returns None when the path does not match and
                    # "" when the element exists but is empty; neither case
                    # has text worth writing.
                    text = textline.findtext(unicode_path)
                    if not text:
                        continue
                    f.write(text + "\n")
                f.write("\n")


# Script entry point: currently a no-op, so running this file directly does nothing.
if __name__ == "__main__":
    pass