# oai_chat / doc2json.py
# Nils Durner
# docx support
# e6ad240
from collections import defaultdict
import json
import zipfile
from lxml import etree
# Font names treated as unremarkable: etree_to_dict leaves out the font
# attributes (ascii, hAnsi, cs, eastAsia) whose value is one of these.
common_fonts = {
    'Arial',
    'Calibri',
    'Times New Roman',
    # Extend this set with further common fonts as needed
}
# Namespace-stripped tag names of elements that are dropped entirely:
# remove_ignored_elements detaches them from the tree and etree_to_dict
# returns None for them.
ignored_elements = {
    'bookmarkEnd',
    'bookmarkStart',
    'ind',
    'jc',
    'lastRenderedPageBreak',
    'numPr',
    'pBdr',
    'pgMar',
    'proofErr',
    'sectPr',
    'spacing',
    'tabs',
    'webHidden',
    # Extend this set with further element names as needed
}
# Namespace-stripped attribute names that are dropped from every element.
# Each name is listed exactly once (the literal previously repeated 'rsidR',
# 'rsidRPr' and 'rsidP').  The functions in this module additionally drop any
# attribute whose local name starts with 'rsid', so the rsid* entries here are
# kept only for explicitness.
ignored_attributes = {
    'paraId',
    'rsidDel',
    'rsidP',
    'rsidR',
    'rsidRDefault',
    'rsidRPr',
    'rsidTr',
    'textId',
    # Extend this set with further attribute names as needed
}
# Namespace-stripped tag names that extract_metadata leaves out of the
# metadata dictionary.
# NOTE(review): extract_metadata only parses docProps/core.xml; verify that
# these names (including their capitalisation) match tags occurring there.
ignored_metadata_elements = {
    'application',
    'characters',
    'charactersWithSpaces',
    'company',
    'docSecurity',
    'hiddenSlides',
    'lines',
    'linksUpToDate',
    'mmClips',
    'notes',
    'pages',
    'paragraphs',
    'scaleCrop',
    'template',
    'words',
    # Extend this set with further metadata element names as needed
}
def remove_ignored_elements(tree):
    """Strip ignored elements and attributes from an lxml tree, in place.

    NOTE: this module defines ``remove_ignored_elements`` a second time
    further down; once the module is loaded, that later definition is the
    one bound to the name.

    For every descendant of ``tree`` (the root itself is not visited):

    * elements whose local tag name is in ``ignored_elements`` are detached
      from their parent;
    * WordprocessingML ``rPr`` elements are detached unless one of their
      direct children has a tag ending in ``highlight``;
    * all other elements lose the attributes named in ``ignored_attributes``
      as well as any attribute whose local name starts with ``rsid``.

    Args:
        tree: lxml element to clean; it must support ``xpath``.

    Returns:
        The same ``tree`` object, after modification.
    """
    run_properties_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
    )

    # xpath() returns a list, so detaching elements while looping is safe.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_properties_tag:
            # Iterate the element directly instead of the deprecated
            # getchildren().  Comment / processing-instruction children have
            # a non-string tag, so they are excluded before endswith().
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Copy the key list first: attributes are deleted while looping.
            for attr in list(elem.attrib):
                attr_name = attr.split('}')[-1]
                if attr_name in ignored_attributes or attr_name.startswith('rsid'):
                    del elem.attrib[attr]

    return tree
def etree_to_dict(t):
    """Convert an element and its descendants into a nested dictionary.

    Args:
        t: an element-like node exposing ``tag``, ``attrib``, ``text`` and
            iteration over its children (e.g. an lxml element).

    Returns:
        ``None`` when the node is not an element (comment or processing
        instruction), when its local tag name is in ``ignored_elements``, or
        when it has no attributes, no children and no text.

        Otherwise a one-key dict ``{local_tag: value}`` where ``value`` is:

        * for an element with attributes and/or children, a dict holding the
          converted children keyed by local tag name (a list when the same
          tag occurs more than once), then the kept attributes keyed by
          local name, then ``'#text'`` if the stripped text is non-empty;
        * for an element with text only, the stripped text.
    """
    # Comments and processing instructions show up when iterating children,
    # but their ``tag`` is a factory function rather than a string, so
    # ``tag.split`` would raise.  They carry nothing we want to export.
    if not isinstance(t.tag, str):
        return None

    tag = t.tag.split('}')[-1]  # Drop the '{namespace-uri}' prefix
    if tag in ignored_elements:
        return None

    children = list(t)
    if not t.attrib and not children and not t.text:
        return None

    # Text-only leaf: the value is just the stripped text.
    if not children and not t.attrib:
        return {tag: t.text.strip()}

    # Group converted children by tag; single occurrences are unwrapped.
    grouped = defaultdict(list)
    for child in children:
        converted = etree_to_dict(child)
        if not converted:
            continue
        for key, item in converted.items():
            grouped[key].append(item)
    value = {
        key: items[0] if len(items) == 1 else items
        for key, items in grouped.items()
    }

    # Attributes are merged after the children, so an attribute whose local
    # name equals a child tag replaces that child's entry.
    for raw_name, attr_value in t.attrib.items():
        name = raw_name.split('}')[-1]  # Drop the '{namespace-uri}' prefix
        if name in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
            # Font attributes are only kept for fonts outside common_fonts.
            if attr_value not in common_fonts:
                value[name] = attr_value
        elif name not in ignored_attributes and not name.startswith('rsid'):
            value[name] = attr_value

    # The text is already a str; no encode/decode round-trip is needed.
    text = t.text.strip() if t.text else ''
    if text:
        value['#text'] = text

    return {tag: value}
# This redefinition replaces the earlier remove_ignored_elements in this
# module; on top of the element/attribute pruning it trims element text.
def remove_ignored_elements(tree):
    """Prune ignored elements and attributes, then trim element text.

    For every descendant of ``tree`` (the root itself is not visited):

    * elements whose local tag name is in ``ignored_elements`` are detached
      from their parent;
    * WordprocessingML ``rPr`` elements are detached unless one of their
      direct children has a tag ending in ``highlight``;
    * all other elements lose the attributes named in ``ignored_attributes``
      as well as any attribute whose local name starts with ``rsid``.

    Afterwards the ``text`` of every element that has text is replaced by
    its whitespace-stripped form.  Tail text is left untouched.

    Args:
        tree: lxml element to clean; it must support ``xpath``.

    Returns:
        The same ``tree`` object, after modification.
    """
    run_properties_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
    )

    # xpath() returns a list, so detaching elements while looping is safe.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_properties_tag:
            # Iterate the element directly instead of the deprecated
            # getchildren().  Comment / processing-instruction children have
            # a non-string tag, so they are excluded before endswith().
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Copy the key list first: attributes are deleted while looping.
            for attr in list(elem.attrib):
                attr_name = attr.split('}')[-1]
                if attr_name in ignored_attributes or attr_name.startswith('rsid'):
                    del elem.attrib[attr]

    # ".//text()" yields both text and tail nodes.  For a tail node,
    # getparent() is the element the tail follows, so writing it to
    # ``.text`` would overwrite that element's own text with its tail.
    # Only real text nodes are written back; the value is already a str,
    # so no encode/decode round-trip is needed.
    for node in tree.xpath(".//text()"):
        if node.is_tail:
            continue
        owner = node.getparent()
        if owner is not None:
            owner.text = node.strip()

    return tree
def extract_metadata(docx):
    """Collect document properties from ``docProps/core.xml``.

    Args:
        docx: an open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        A dict mapping the namespace-stripped tag of each direct child of
        the core-properties root to that child's text (``None`` if it has
        none).  Tags listed in ``ignored_metadata_elements`` are left out.
        The dict is empty when the archive has no ``docProps/core.xml``.
    """
    metadata = {}

    try:
        core_xml = docx.open('docProps/core.xml')
    except KeyError:
        # ZipFile.open raises KeyError for a missing member.  Treat that as
        # "no metadata" instead of failing the whole conversion.
        return metadata

    with core_xml:
        xml_content = core_xml.read()

    core_tree = etree.XML(xml_content)
    # Iterate the element directly instead of the deprecated getchildren().
    for child in core_tree:
        # Comments / processing instructions have a non-string tag.
        if not isinstance(child.tag, str):
            continue
        tag = child.tag.split('}')[-1]  # Drop the '{namespace-uri}' prefix
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text

    return metadata
def process_docx(file_path):
    """Convert a .docx file into a JSON string.

    The archive's ``word/document.xml`` is parsed, cleaned with
    ``remove_ignored_elements`` and converted with ``etree_to_dict``.  The
    properties returned by ``extract_metadata`` are stored under the
    top-level ``'metadata'`` key of the resulting dictionary.

    Args:
        file_path: path (or file-like object) accepted by
            ``zipfile.ZipFile``.

    Returns:
        The dictionary serialised with ``json.dumps`` using
        ``ensure_ascii=False`` and a two-space indent.
    """
    # Read everything needed from the archive, then release it.
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        with docx.open('word/document.xml') as document_xml:
            xml_content = document_xml.read()

    document_tree = remove_ignored_elements(etree.XML(xml_content))

    # etree_to_dict returns None when the root yields nothing to export;
    # fall back to an empty dict so the metadata can still be attached.
    document_dict = etree_to_dict(document_tree) or {}
    document_dict['metadata'] = metadata

    return json.dumps(document_dict, ensure_ascii=False, indent=2)