Spaces:

ManishThota
/

Groq

Sleeping

App Files Files Community

ManishThota committed on Feb 29

Commit

54aff35

•

1 Parent(s): a1e792f

Delete doc2json.py

Browse files

Files changed (1) hide show

doc2json.py +0 -181

doc2json.py DELETED Viewed

@@ -1,181 +0,0 @@
-from collections import defaultdict
-import json
-import zipfile
-from lxml import etree
-# Font names treated as unremarkable defaults: etree_to_dict drops a font
-# attribute (ascii / hAnsi / cs / eastAsia) whose value is one of these.
-# Extend the set to treat further fonts the same way.
-common_fonts = {'Arial', 'Calibri', 'Times New Roman'}
-# Local (namespace-stripped) element names that are discarded wherever they
-# occur: remove_ignored_elements detaches them from the tree and
-# etree_to_dict returns None for them.  Extend the set as needed.
-ignored_elements = {
-    # proofing marks, bookmarks and rendering hints
-    'proofErr', 'bookmarkStart', 'bookmarkEnd',
-    'lastRenderedPageBreak', 'webHidden',
-    # numbering, borders, indentation, spacing, alignment and tab stops
-    'numPr', 'pBdr', 'ind', 'spacing', 'jc', 'tabs',
-    # section properties and page margins
-    'sectPr', 'pgMar',
-}
-# Local (namespace-stripped) attribute names that are dropped from every
-# element.  Each name is listed exactly once; the previous literal repeated
-# 'rsidR', 'rsidRPr' and 'rsidP', which a set silently collapses anyway.
-# The code that consults this set also drops any attribute whose local name
-# starts with 'rsid', so the rsid* entries here are belt-and-braces.
-ignored_attributes = {
-    'rsidR',
-    'rsidRPr',
-    'rsidRDefault',
-    'rsidP',
-    'rsidDel',
-    'rsidTr',
-    'paraId',
-    'textId',
-    # Add any other attributes to ignore here
-}
-# Local tag names that extract_metadata leaves out of the metadata dict.
-# NOTE(review): extract_metadata compares these, case-sensitively, against
-# the children of docProps/core.xml only; the names below look like
-# lower-camel-cased application statistics -- confirm they can actually
-# match the tags found in that part.
-ignored_metadata_elements = {
-    # producing application and security/link flags
-    'application', 'docSecurity', 'scaleCrop', 'linksUpToDate',
-    'company', 'template',
-    # size statistics
-    'pages', 'words', 'characters', 'charactersWithSpaces',
-    'lines', 'paragraphs',
-    # presentation-related counters
-    'hiddenSlides', 'mmClips', 'notes',
-}
-def remove_ignored_elements(tree):
-    """Prune noise from a parsed XML tree in place and return the same tree.
-
-    Each descendant element of ``tree`` is handled by the first rule that
-    applies:
-
-    * its local name is in ``ignored_elements``: it is detached from its
-      parent;
-    * it is a WordprocessingML ``rPr`` element: it is detached unless one
-      of its direct children has a tag ending in ``highlight``;
-    * otherwise: every attribute whose local name is in
-      ``ignored_attributes`` or starts with ``rsid`` is deleted.
-
-    NOTE: this module defines ``remove_ignored_elements`` a second time
-    further down; because that definition comes later, it is the one the
-    name is bound to once the module has loaded.
-    """
-    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
-    for node in tree.xpath(".//*"):
-        local_name = node.tag.split('}')[-1]
-        if local_name in ignored_elements:
-            node.getparent().remove(node)
-            continue
-        if node.tag == run_props_tag:
-            # Run properties survive only when they carry a highlight.
-            has_highlight = any(kid.tag.endswith('highlight') for kid in node.getchildren())
-            if not has_highlight:
-                node.getparent().remove(node)
-            continue
-        # Collect first, delete afterwards, so the mapping is not mutated
-        # while it is being iterated.
-        stale_names = [
-            name for name in node.attrib
-            if name.split('}')[-1] in ignored_attributes
-            or name.split('}')[-1].startswith('rsid')
-        ]
-        for name in stale_names:
-            del node.attrib[name]
-    return tree
-def etree_to_dict(t):
-    """Recursively convert an lxml element into a ``{local_tag: value}`` dict.
-
-    The namespace part of the tag is stripped.  ``value`` is:
-
-    * the stripped text, when the element has neither attributes nor
-      children;
-    * otherwise a dict holding, in this order, the converted children
-      (a child tag that occurs several times maps to a list), the kept
-      attributes, and ``'#text'`` when the stripped text is non-empty.
-
-    Attribute filtering: the font attributes ``ascii``, ``hAnsi``, ``cs``
-    and ``eastAsia`` are dropped when their value is in ``common_fonts``;
-    any other attribute is dropped when its local name is in
-    ``ignored_attributes`` or starts with ``rsid``.
-
-    Returns ``None`` (so that the caller skips the node) for comment and
-    processing-instruction nodes, for tags in ``ignored_elements``, and for
-    elements that have no attributes, no children and no text.
-    """
-    # Comments and processing instructions expose a callable, not a string,
-    # as ``tag``; calling ``.split`` on it would raise AttributeError.
-    if not isinstance(t.tag, str):
-        return None
-    tag = t.tag.split('}')[-1]  # Remove namespace URI
-    if tag in ignored_elements:
-        return None
-    children = list(t)
-    has_attribs = bool(t.attrib)
-    if not has_attribs and not children and not t.text:
-        return None
-    if not has_attribs and not children:
-        # Leaf with text only: the value is the bare (stripped) string.
-        # lxml already hands back decoded ``str``, so no re-encoding is needed.
-        return {tag: t.text.strip()}
-    content = {}
-    if children:
-        grouped = defaultdict(list)
-        for child in children:
-            child_dict = etree_to_dict(child)
-            if child_dict is None:
-                continue
-            for key, value in child_dict.items():
-                grouped[key].append(value)
-        # A tag seen once keeps its single value; repeated tags become a list.
-        content = {
-            key: values[0] if len(values) == 1 else values
-            for key, values in grouped.items()
-        }
-    for name, value in t.attrib.items():
-        name = name.split('}')[-1]  # Remove namespace URI
-        if name in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
-            # Font attributes are only interesting for uncommon fonts.
-            if value not in common_fonts:
-                content[name] = value
-        elif name not in ignored_attributes and not name.startswith('rsid'):
-            content[name] = value
-    if t.text:
-        text = t.text.strip()
-        if text:
-            content['#text'] = text
-    return {tag: content}
-# Second definition of 'remove_ignored_elements': it replaces the earlier one
-# and additionally strips surrounding whitespace from element text.
-def remove_ignored_elements(tree):
-    """Prune noise from a parsed XML tree in place and return the same tree.
-
-    Pass 1 visits every descendant element and applies the first matching
-    rule:
-
-    * local name in ``ignored_elements``: the element is detached;
-    * WordprocessingML ``rPr``: detached unless a direct child's tag ends
-      in ``highlight``;
-    * otherwise: attributes whose local name is in ``ignored_attributes``
-      or starts with ``rsid`` are deleted.
-
-    Pass 2 replaces each element's ``text`` with its stripped form.  Tail
-    text is left untouched.
-    """
-    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'
-    for elem in tree.xpath(".//*"):
-        tag_without_ns = elem.tag.split('}')[-1]
-        if tag_without_ns in ignored_elements:
-            elem.getparent().remove(elem)
-        elif elem.tag == run_props_tag:  # Check for highlights in rPr
-            # Iterate the element directly; getchildren() is deprecated in lxml.
-            if not any(child.tag.endswith('highlight') for child in elem):
-                elem.getparent().remove(elem)
-        else:
-            # Remove ignored attributes
-            for attr in list(elem.attrib):
-                attr_without_ns = attr.split('}')[-1]
-                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
-                    del elem.attrib[attr]
-    for text_node in tree.xpath(".//text()"):
-        # For a tail node getparent() is the element the tail follows, so
-        # writing it to ``.text`` would overwrite that element's own text
-        # with its tail.  Only genuine element text is normalised.
-        if text_node.is_tail:
-            continue
-        parent = text_node.getparent()
-        if parent is not None:
-            # lxml already yields decoded ``str``; a UTF-8 encode/decode
-            # round trip would change nothing, so only the strip remains.
-            parent.text = text_node.strip()
-    return tree
-def extract_metadata(docx):
-    """Extract metadata from the document properties, ignoring specified elements.
-
-    ``docx`` is an opened archive exposing ``open(name)`` the way
-    ``zipfile.ZipFile`` does.  The part ``docProps/core.xml`` is parsed and
-    each direct child of its root contributes ``{local_tag: text}``, unless
-    the local tag is in ``ignored_metadata_elements``.
-
-    Returns a dict (values may be ``None`` for children without text).  An
-    archive that has no ``docProps/core.xml`` yields an empty dict.
-    """
-    try:
-        with docx.open('docProps/core.xml') as core_xml:
-            xml_content = core_xml.read()
-    except KeyError:
-        # ZipFile.open raises KeyError for a member that is not in the
-        # archive; a package without core properties simply has no metadata.
-        return {}
-    core_tree = etree.XML(xml_content)
-    metadata = {}
-    # Iterate the root directly; getchildren() is deprecated in lxml.
-    for child in core_tree:
-        if not isinstance(child.tag, str):
-            continue  # comment / processing instruction: no tag name to record
-        tag = child.tag.split('}')[-1]  # Get tag without namespace
-        if tag not in ignored_metadata_elements:
-            metadata[tag] = child.text
-    return metadata
-def process_docx(file_path):
-    """Convert a .docx file into a JSON string.
-
-    The archive at ``file_path`` is opened with ``zipfile``; metadata comes
-    from ``extract_metadata`` and the body from ``word/document.xml``, which
-    is cleaned by ``remove_ignored_elements`` and converted by
-    ``etree_to_dict``.  The metadata dict is stored under the top-level key
-    ``'metadata'`` next to the converted root element.
-
-    Returns the JSON text, indented by two spaces, with non-ASCII characters
-    written as-is (``ensure_ascii=False``).
-    """
-    # Read what is needed and release the archive before the heavier work.
-    with zipfile.ZipFile(file_path) as docx:
-        metadata = extract_metadata(docx)
-        with docx.open('word/document.xml') as document_xml:
-            xml_content = document_xml.read()
-    # Remove the ignored elements, then convert the rest of the tree.
-    document_tree = remove_ignored_elements(etree.XML(xml_content))
-    # etree_to_dict returns None for a root with no attributes, children or
-    # text; fall back to an empty dict so the metadata can still be attached.
-    document_dict = etree_to_dict(document_tree) or {}
-    document_dict['metadata'] = metadata  # Add metadata to the document dictionary
-    return json.dumps(document_dict, ensure_ascii=False, indent=2)