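"""Convert a .docx document into a JSON representation.

Reads word/document.xml and docProps/core.xml from the .docx archive,
strips revision-tracking attributes, layout-only elements and common font
declarations, and serialises the remaining structure plus document
metadata to JSON.
"""
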
from collections import defaultdict
import json
import zipfile
from lxml import etree

# Define common fonts to ignore
common_fonts = {
    'Times New Roman',
    'Arial',
    'Calibri',
    # Add any other common fonts here
}

# Define elements to ignore
ignored_elements = {
    'proofErr',
    'bookmarkStart',
    'bookmarkEnd',
    'lastRenderedPageBreak',
    'webHidden',
    'numPr',
    'pBdr',
    'ind',
    'spacing',
    'jc',
    'tabs',
    'sectPr',
    'pgMar'
    # Add any other elements to ignore here
}

# Define attributes to ignore
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidDel',
    'rsidP',
    'rsidTr',
    'paraId',
    'textId',
}

# Define metadata elements to ignore
ignored_metadata_elements = {
    'application',
    'docSecurity',
    'scaleCrop',
    'linksUpToDate',
    'charactersWithSpaces',
    'hiddenSlides',
    'mmClips',
    'notes',
    'words',
    'characters',
    'pages',
    'lines',
    'paragraphs',
    'company',
    'template',
    # Add any other metadata elements to ignore here
}

def etree_to_dict(t):
    """Convert an lxml etree to a nested dictionary, excluding ignored namespaces and attributes."""
    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    d = {tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in filter(None, map(etree_to_dict, children)):
            for k, v in dc.items():
                dd[k].append(v)
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}

    if t.attrib:
        # Filter out common fonts and ignored attributes
        filtered_attribs = {}
        for k, v in t.attrib.items():
            k = k.split('}')[-1]  # Remove namespace URI
            if k in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                if v not in common_fonts:
                    filtered_attribs[k] = v
            elif k not in ignored_attributes and not k.startswith('rsid'):
                filtered_attribs[k] = v
        d[tag].update(filtered_attribs)
    
    if t.text:
        # lxml already returns decoded unicode strings, so only whitespace needs trimming
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text
    
    if not t.attrib and not children and not t.text:
        return None

    return d

def remove_ignored_elements(tree):
    """Remove ignored elements and attributes from the XML tree, keeping rPr only when it carries a highlight."""
    for elem in tree.xpath(".//*"):
        tag_without_ns = elem.tag.split('}')[-1]
        if tag_without_ns in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr':
            # Drop run properties unless they contain a highlight child
            if not any(child.tag.endswith('highlight') for child in elem):
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes (including all revision-save IDs)
            for attr in list(elem.attrib):
                attr_without_ns = attr.split('}')[-1]
                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
                    del elem.attrib[attr]
    # Normalise whitespace in text nodes, writing tail text back as tails rather than element text
    for node in tree.xpath(".//text()"):
        parent = node.getparent()
        if parent is None:
            continue
        cleaned = str(node).strip()
        if node.is_tail:
            parent.tail = cleaned
        else:
            parent.text = cleaned
    return tree

def extract_metadata(docx):
    """Extract metadata from the document properties, ignoring specified elements."""
    metadata = {}
    with docx.open('docProps/core.xml') as core_xml:
        xml_content = core_xml.read()
        core_tree = etree.XML(xml_content)
        for child in core_tree:
            tag = child.tag.split('}')[-1]  # Get tag without namespace
            if tag not in ignored_metadata_elements:
                metadata[tag] = child.text
    return metadata

def process_docx(file_path):
    # Load the document with zipfile and lxml
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        with docx.open('word/document.xml') as document_xml:
            xml_content = document_xml.read()
            document_tree = etree.XML(xml_content)

            # Remove the ignored elements
            document_tree = remove_ignored_elements(document_tree)

            # Convert the rest of the XML tree to a dictionary
            document_dict = etree_to_dict(document_tree)
            document_dict['metadata'] = metadata  # Add metadata to the document dictionary

            docx_json = json.dumps(document_dict, ensure_ascii=False, indent=2)

            return docx_json
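
# Example usage (illustrative sketch; 'example.docx' is a placeholder path)
if __name__ == '__main__':
    print(process_docx('example.docx'))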