# Spaces status: Runtime error
# File size: 5,960 bytes (revision e6ad240)
from collections import defaultdict
import json
import zipfile
from lxml import etree
# Font names considered unremarkable: a font attribute (ascii, hAnsi, cs,
# eastAsia) whose value is listed here is left out of the converted output.
common_fonts = {
    'Arial',
    'Calibri',
    'Times New Roman',
    # Add any other common fonts here
}
# Local (namespace-free) element names that are dropped, together with
# everything nested inside them, before the document is converted.
ignored_elements = {
    'bookmarkEnd',
    'bookmarkStart',
    'ind',
    'jc',
    'lastRenderedPageBreak',
    'numPr',
    'pBdr',
    'pgMar',
    'proofErr',
    'sectPr',
    'spacing',
    'tabs',
    'webHidden',
    # Add any other elements to ignore here
}
# Local (namespace-free) attribute names that are dropped from the output.
# The code that consults this set also filters every attribute whose name
# starts with 'rsid', so the rsid* entries are listed only for explicitness.
# Each name appears exactly once (the earlier literal repeated 'rsidR',
# 'rsidRPr' and 'rsidP').
ignored_attributes = {
    'paraId',
    'rsidDel',
    'rsidP',
    'rsidR',
    'rsidRDefault',
    'rsidRPr',
    'rsidTr',
    'textId',
    # Add any other attributes to ignore here
}
# Local (namespace-free) names of document-property elements that are left
# out of the extracted metadata.
ignored_metadata_elements = {
    'application',
    'characters',
    'charactersWithSpaces',
    'company',
    'docSecurity',
    'hiddenSlides',
    'lines',
    'linksUpToDate',
    'mmClips',
    'notes',
    'pages',
    'paragraphs',
    'scaleCrop',
    'template',
    'words',
    # Add any other metadata elements to ignore here
}
def remove_ignored_elements(tree):
    """Strip ignored elements and attributes from an lxml tree, in place.

    NOTE: a second function with this same name is defined further down in
    this file; that later definition is the one bound after import.

    For every descendant element of ``tree``:

    * an element whose local name is in ``ignored_elements`` is removed;
    * a WordprocessingML ``rPr`` element is removed unless one of its
      direct children has a tag ending in ``highlight``;
    * any other element loses the attributes whose local name is in
      ``ignored_attributes`` or starts with ``rsid``.

    Args:
        tree: lxml element acting as the root of the tree to clean.

    Returns:
        The same ``tree`` object, after modification.
    """
    rpr_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() returns a list, so removing elements while looping is safe.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]
        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == rpr_tag:
            # Iterate the element directly (getchildren() is deprecated) and
            # skip comment/PI children, whose tag is not a string.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_name = attr.split('}')[-1]
                if attr_name in ignored_attributes or attr_name.startswith('rsid'):
                    del elem.attrib[attr]
    return tree
def etree_to_dict(t):
    """Convert an element tree into nested dictionaries keyed by local tag name.

    The result has the form ``{tag: value}`` where ``value`` is:

    * a dict when the element has children and/or attributes. Children are
      grouped by local tag name (a single child is stored directly, repeated
      tags become a list), kept attributes are merged into the same dict
      under their local name, and non-empty stripped text is stored under
      ``'#text'``. Because attributes are merged last, an attribute sharing
      a name with a child tag replaces that child's entry.
    * the stripped text when the element has only text.

    Filtering applied along the way:

    * elements whose local name is in ``ignored_elements`` are skipped;
    * ``ascii``/``hAnsi``/``cs``/``eastAsia`` attributes are dropped when
      their value is in ``common_fonts``;
    * other attributes are dropped when their local name is in
      ``ignored_attributes`` or starts with ``rsid``.

    Args:
        t: element exposing ``tag``, ``attrib``, ``text`` and iteration over
            its children.

    Returns:
        A one-key dict as described above, or ``None`` when the node is not
        an element (comment / processing instruction), is ignored, or has no
        attributes, no children and no text.
    """
    if not isinstance(t.tag, str):
        # Comments and processing instructions carry a non-string tag;
        # calling .split() on it would raise AttributeError.
        return None

    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    children = list(t)
    has_attribs = bool(t.attrib)

    if children:
        grouped = defaultdict(list)
        for child in children:
            child_dict = etree_to_dict(child)
            if child_dict:
                for key, value in child_dict.items():
                    grouped[key].append(value)
        d = {
            tag: {
                key: values[0] if len(values) == 1 else values
                for key, values in grouped.items()
            }
        }
    else:
        d = {tag: {} if has_attribs else None}

    if has_attribs:
        # Filter out common fonts and ignored attributes
        filtered_attribs = {}
        for name, value in t.attrib.items():
            name = name.split('}')[-1]  # Remove namespace URI
            if name in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                if value not in common_fonts:
                    filtered_attribs[name] = value
            elif name not in ignored_attributes and not name.startswith('rsid'):
                filtered_attribs[name] = value
        d[tag].update(filtered_attribs)

    if t.text:
        # The text is already a str, so no encode/decode step is needed
        # (the former UTF-8 round trip returned the string unchanged).
        text = t.text.strip()
        if children or has_attribs:
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text

    if not has_attribs and not children and not t.text:
        return None
    return d
# Final definition of remove_ignored_elements: it replaces the earlier
# definition above and additionally normalises element text.
def remove_ignored_elements(tree):
    """Strip ignored elements/attributes from an lxml tree and trim its text.

    For every descendant element of ``tree``:

    * an element whose local name is in ``ignored_elements`` is removed;
    * a WordprocessingML ``rPr`` element is removed unless one of its
      direct children has a tag ending in ``highlight``;
    * any other element loses the attributes whose local name is in
      ``ignored_attributes`` or starts with ``rsid``.

    Afterwards the ``.text`` of every remaining element that has text is
    replaced by its whitespace-stripped form. Tail text is left untouched.

    Args:
        tree: lxml element acting as the root of the tree to clean.

    Returns:
        The same ``tree`` object, after modification.
    """
    rpr_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() returns a list, so removing elements while looping is safe.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]
        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == rpr_tag:
            # Iterate the element directly (getchildren() is deprecated) and
            # skip comment/PI children, whose tag is not a string.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_name = attr.split('}')[-1]
                if attr_name in ignored_attributes or attr_name.startswith('rsid'):
                    del elem.attrib[attr]

    # Trim the text of each element. The values are already str objects, so
    # no encode/decode step is required.
    for text_node in tree.xpath(".//text()"):
        if not text_node.is_text:
            # text() also yields tail strings, whose getparent() is the
            # element they follow; writing one into .text would overwrite
            # that element's real content.
            continue
        parent = text_node.getparent()
        if parent is not None:
            parent.text = text_node.strip()
    return tree
def extract_metadata(docx):
    """Extract metadata from the document properties, ignoring specified elements.

    Reads ``docProps/core.xml`` from the archive and maps the local tag name
    of each direct child of its root element to that child's text. Children
    whose local name is in ``ignored_metadata_elements`` are skipped.

    NOTE(review): the names in ``ignored_metadata_elements`` are compared
    case-sensitively against the tags found in ``core.xml``; confirm that
    they match the casing and the part they are meant to filter.

    Args:
        docx: open archive object providing ``open(name)`` (``process_docx``
            passes a ``zipfile.ZipFile``).

    Returns:
        Dict of ``{local_tag: text}``; empty when the archive contains no
        ``docProps/core.xml`` part.
    """
    try:
        with docx.open('docProps/core.xml') as core_xml:
            xml_content = core_xml.read()
    except KeyError:
        # ZipFile.open raises KeyError for a missing member; treat a
        # document without core properties as having no metadata.
        return {}

    core_tree = etree.XML(xml_content)
    metadata = {}
    # Iterate the element directly; getchildren() is deprecated.
    for child in core_tree:
        if not isinstance(child.tag, str):
            # Comments / processing instructions have no usable tag name.
            continue
        tag = child.tag.split('}')[-1]  # Get tag without namespace
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
def process_docx(file_path):
    """Convert a .docx file into a JSON string.

    Opens the file as a zip archive, extracts the metadata, parses
    ``word/document.xml``, removes ignored elements and attributes, converts
    what remains into nested dictionaries and adds the metadata under the
    top-level ``'metadata'`` key.

    Args:
        file_path: path or file-like object accepted by ``zipfile.ZipFile``.

    Returns:
        The document as a JSON string (``ensure_ascii=False``, 2-space
        indent).
    """
    # Load the raw parts with zipfile; the archive is only needed for this.
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        with docx.open('word/document.xml') as document_xml:
            xml_content = document_xml.read()

    document_tree = etree.XML(xml_content)
    # Remove the ignored elements
    document_tree = remove_ignored_elements(document_tree)
    # Convert the rest of the XML tree to a dictionary
    document_dict = etree_to_dict(document_tree)
    if document_dict is None:
        # etree_to_dict returns None for an empty or ignored root; fall back
        # to an empty dict so the metadata can still be attached.
        document_dict = {}
    document_dict['metadata'] = metadata  # Add metadata to the document dictionary
    return json.dumps(document_dict, ensure_ascii=False, indent=2)
|