ManishThota committed on
Commit
54aff35
1 Parent(s): a1e792f

Delete doc2json.py

Browse files
Files changed (1) hide show
  1. doc2json.py +0 -181
doc2json.py DELETED
@@ -1,181 +0,0 @@
1
- from collections import defaultdict
2
- import json
3
- import zipfile
4
- from lxml import etree
5
-
6
# Font names considered too ordinary to report. etree_to_dict omits the
# font attributes (ascii, hAnsi, cs, eastAsia) whose value is listed here.
# Extend the set with any further fonts that should be suppressed.
common_fonts = {'Arial', 'Calibri', 'Times New Roman'}
13
-
14
# Namespace-stripped element names that are dropped from the document tree
# (see remove_ignored_elements) and skipped by etree_to_dict.
# Extend the set with any further element names that should be discarded.
ignored_elements = {
    'bookmarkEnd', 'bookmarkStart', 'ind', 'jc', 'lastRenderedPageBreak',
    'numPr', 'pBdr', 'pgMar', 'proofErr', 'sectPr', 'spacing', 'tabs',
    'webHidden',
}
31
-
32
# Namespace-stripped attribute names that are removed from elements and left
# out of the generated dictionaries. Each name is listed exactly once; the
# consumers additionally drop every attribute whose name starts with 'rsid',
# so the rsid* entries here are kept only for explicitness.
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidP',
    'rsidDel',
    'rsidTr',
    'paraId',
    'textId',
    # Add any other attributes to ignore here
}
47
-
48
# Namespace-stripped metadata element names that extract_metadata leaves out
# of its result. Extend the set with any further names to suppress.
# NOTE(review): extract_metadata only reads docProps/core.xml; these names
# look like extended (app.xml) properties with a lower-cased first letter, so
# the filter may never match anything -- confirm against real documents.
ignored_metadata_elements = {
    'application', 'characters', 'charactersWithSpaces', 'company',
    'docSecurity', 'hiddenSlides', 'lines', 'linksUpToDate', 'mmClips',
    'notes', 'pages', 'paragraphs', 'scaleCrop', 'template', 'words',
}
67
-
68
def remove_ignored_elements(tree):
    """Prune ignored elements and attributes from ``tree`` in place.

    Every descendant of ``tree`` is visited once:

    * an element whose namespace-stripped tag is in ``ignored_elements`` is
      detached from its parent;
    * a WordprocessingML ``rPr`` element is detached unless one of its direct
      children has a tag ending in ``highlight``;
    * any other element keeps its place but loses every attribute whose
      namespace-stripped name is in ``ignored_attributes`` or starts with
      ``rsid``.

    NOTE: a second function with this same name is defined further down in
    this module; at import time that later definition replaces this one.

    Args:
        tree: lxml element whose descendants are cleaned.

    Returns:
        The same ``tree`` object, after modification.
    """
    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() returns a list, so detaching nodes while looping is safe.
    for node in tree.xpath(".//*"):
        local_name = node.tag.split('}')[-1]

        if local_name in ignored_elements:
            node.getparent().remove(node)
            continue

        if node.tag == run_props_tag:
            # Run properties survive only when they carry a highlight.
            has_highlight = any(kid.tag.endswith('highlight') for kid in node)
            if not has_highlight:
                node.getparent().remove(node)
            continue

        # Collect first, delete afterwards, so the mapping is not mutated
        # while it is being iterated.
        doomed = [
            name for name in node.attrib
            if name.split('}')[-1] in ignored_attributes
            or name.split('}')[-1].startswith('rsid')
        ]
        for name in doomed:
            del node.attrib[name]

    return tree
84
-
85
def etree_to_dict(t):
    """Convert an element and its subtree into a nested ``{tag: content}`` dict.

    Tags and attribute names are used without their namespace URI. The
    content stored under the tag is:

    * a dict when the element has children and/or attributes. Converted
      children are keyed by tag (a list when a tag repeats), kept attributes
      are merged into the same dict, and non-blank text is stored under
      ``'#text'``;
    * the stripped text when the element has text but neither children nor
      attributes.

    Attributes are dropped when their name is in ``ignored_attributes`` or
    starts with ``rsid``; the font attributes ``ascii``, ``hAnsi``, ``cs`` and
    ``eastAsia`` are dropped when their value is in ``common_fonts``.

    Args:
        t: Element to convert (lxml or ElementTree compatible).

    Returns:
        A single-key dict, or ``None`` when the node is not a regular element
        (comment / processing instruction), its tag is in
        ``ignored_elements``, or it has no attributes, children or text.
    """
    # Comments and processing instructions expose a non-string ``tag``;
    # calling ``.split`` on it would raise, so they are skipped outright.
    if not isinstance(t.tag, str):
        return None

    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    children = list(t)
    has_attrs = bool(t.attrib)
    if not has_attrs and not children and not t.text:
        return None

    if children:
        grouped = defaultdict(list)
        for child in children:
            converted = etree_to_dict(child)
            if converted:
                for key, value in converted.items():
                    grouped[key].append(value)
        # A tag that occurs once maps to its value, repeated tags to a list.
        content = {
            key: values[0] if len(values) == 1 else values
            for key, values in grouped.items()
        }
    elif has_attrs:
        content = {}
    else:
        content = None

    if has_attrs:
        for name, value in t.attrib.items():
            name = name.split('}')[-1]  # Remove namespace URI
            if name in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                # Font attributes are reported only for uncommon fonts.
                if value not in common_fonts:
                    content[name] = value
            elif name not in ignored_attributes and not name.startswith('rsid'):
                content[name] = value

    if t.text:
        # The text is already a str, so no encode/decode round trip is needed.
        text = t.text.strip()
        if children or has_attrs:
            if text:
                content['#text'] = text
        else:
            content = text

    return {tag: content}
126
-
127
# NOTE: this definition rebinds the name remove_ignored_elements and therefore
# supersedes the earlier function of the same name defined above. On top of
# the pruning it also trims the whitespace of every text node.
def remove_ignored_elements(tree):
    """Prune ignored elements/attributes from ``tree`` and trim its text nodes.

    First pass, over every descendant element:

    * an element whose namespace-stripped tag is in ``ignored_elements`` is
      detached from its parent;
    * a WordprocessingML ``rPr`` element is detached unless one of its direct
      children has a tag ending in ``highlight``;
    * any other element loses every attribute whose namespace-stripped name
      is in ``ignored_attributes`` or starts with ``rsid``.

    Second pass, over every text node still in the tree: leading and trailing
    whitespace is stripped and the result is written back to the slot it came
    from (``.text`` for element text, ``.tail`` for tail text), so that tail
    text no longer overwrites the text of the element it follows.

    Args:
        tree: lxml element whose descendants are cleaned.

    Returns:
        The same ``tree`` object, after modification.
    """
    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() returns a list, so detaching nodes while looping is safe.
    for elem in tree.xpath(".//*"):
        tag_without_ns = elem.tag.split('}')[-1]
        if tag_without_ns in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_props_tag:
            # Keep run properties only when they carry a highlight. Children
            # with a non-string tag (comments, PIs) can never be a highlight.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_without_ns = attr.split('}')[-1]
                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
                    del elem.attrib[attr]

    # Trim each text node. The values are already str, so no encode/decode
    # round trip is required.
    for text_node in tree.xpath(".//text()"):
        owner = text_node.getparent()
        if owner is None:
            continue
        trimmed = text_node.strip()
        if text_node.is_tail:
            # For tail text, getparent() is the element the text follows.
            owner.tail = trimmed
        else:
            owner.text = trimmed
    return tree
151
-
152
def extract_metadata(docx):
    """Extract document properties from ``docProps/core.xml``.

    Args:
        docx: An opened archive exposing ``open(name)`` as a context manager,
            e.g. the ``zipfile.ZipFile`` created in ``process_docx``.

    Returns:
        dict mapping the namespace-stripped tag of each direct child of the
        core-properties root to that child's text (``None`` when it has no
        text). Tags listed in ``ignored_metadata_elements`` are left out.
        An empty dict is returned when the archive has no
        ``docProps/core.xml`` member.
    """
    try:
        with docx.open('docProps/core.xml') as core_xml:
            xml_content = core_xml.read()
    except KeyError:
        # ZipFile.open raises KeyError for a missing member; a package
        # without core properties simply has no metadata to report.
        return {}

    core_tree = etree.XML(xml_content)
    metadata = {}
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for child in core_tree:
        if not isinstance(child.tag, str):
            # Comments / processing instructions carry no property value.
            continue
        tag = child.tag.split('}')[-1]  # Get tag without namespace
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
163
-
164
def process_docx(file_path):
    """Render the .docx file at ``file_path`` as a JSON string.

    The archive is opened with ``zipfile``; its metadata is gathered by
    ``extract_metadata`` and ``word/document.xml`` is parsed with lxml. The
    parsed tree is cleaned by ``remove_ignored_elements``, converted with
    ``etree_to_dict``, and the metadata is attached under the top-level
    ``'metadata'`` key.

    Args:
        file_path: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The document dictionary serialized with ``json.dumps`` using
        ``ensure_ascii=False`` and an indent of 2.
    """
    with zipfile.ZipFile(file_path) as archive:
        properties = extract_metadata(archive)
        raw_document = archive.read('word/document.xml')

    # Parse, prune the ignored parts, then turn the remainder into a dict.
    root = etree.XML(raw_document)
    cleaned_root = remove_ignored_elements(root)
    as_dict = etree_to_dict(cleaned_root)

    # Metadata sits next to the document content at the top level.
    as_dict['metadata'] = properties

    return json.dumps(as_dict, ensure_ascii=False, indent=2)