Nils Durner committed on
Commit
e6ad240
1 Parent(s): 679fce2

docx support

Browse files
Files changed (3) hide show
  1. app.py +11 -6
  2. doc2json.py +181 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -4,6 +4,8 @@ import os
4
  from openai import OpenAI
5
  import json
6
 
 
 
7
  dump_controls = False
8
  log_to_console = False
9
 
@@ -48,13 +50,16 @@ def add_text(history, text):
48
  return history, gr.Textbox(value="", interactive=False)
49
 
50
  def add_file(history, file):
51
- with open(file.name, mode="rb") as f:
52
- content = f.read()
 
 
 
53
 
54
- if isinstance(content, bytes):
55
- content = content.decode('utf-8', 'replace')
56
- else:
57
- content = str(content)
58
 
59
  fn = os.path.basename(file.name)
60
  history = history + [(f'```{fn}\n{content}\n```', None)]
 
4
  from openai import OpenAI
5
  import json
6
 
7
+ from doc2json import process_docx
8
+
9
  dump_controls = False
10
  log_to_console = False
11
 
 
50
  return history, gr.Textbox(value="", interactive=False)
51
 
52
  def add_file(history, file):
53
+ if file.name.endswith(".docx"):
54
+ content = process_docx(file.name)
55
+ else:
56
+ with open(file.name, mode="rb") as f:
57
+ content = f.read()
58
 
59
+ if isinstance(content, bytes):
60
+ content = content.decode('utf-8', 'replace')
61
+ else:
62
+ content = str(content)
63
 
64
  fn = os.path.basename(file.name)
65
  history = history + [(f'```{fn}\n{content}\n```', None)]
doc2json.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import json
3
+ import zipfile
4
+ from lxml import etree
5
+
6
# Font names considered "default"; font attributes (ascii/hAnsi/cs/eastAsia)
# carrying one of these values are dropped from the JSON output.
common_fonts = {
    'Times New Roman',
    'Arial',
    'Calibri',
    # Add any other common fonts here
}

# Local (namespace-stripped) element names that are removed from the tree
# and skipped during dictionary conversion.
ignored_elements = {
    'proofErr',
    'bookmarkStart',
    'bookmarkEnd',
    'lastRenderedPageBreak',
    'webHidden',
    'numPr',
    'pBdr',
    'ind',
    'spacing',
    'jc',
    'tabs',
    'sectPr',
    'pgMar',
    # Add any other elements to ignore here
}

# Local (namespace-stripped) attribute names that are dropped.  Each name is
# listed exactly once; any other attribute starting with 'rsid' is filtered
# separately by the functions that consult this set.
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidP',
    'paraId',
    'textId',
    'rsidDel',
    'rsidTr',
    # Add any other attributes to ignore here
}

# Local element names that are left out of the extracted metadata.
ignored_metadata_elements = {
    'application',
    'docSecurity',
    'scaleCrop',
    'linksUpToDate',
    'charactersWithSpaces',
    'hiddenSlides',
    'mmClips',
    'notes',
    'words',
    'characters',
    'pages',
    'lines',
    'paragraphs',
    'company',
    'template',
    # Add any other metadata elements to ignore here
}
67
+
68
def remove_ignored_elements(tree):
    """Prune ignored elements and attributes from *tree* in place.

    NOTE: this module defines ``remove_ignored_elements`` a second time
    further down; that later definition replaces this one at import time.

    For every descendant element of *tree*:

    * elements whose local name is in ``ignored_elements`` are detached;
    * ``w:rPr`` elements are detached unless they have a child whose tag
      ends with ``highlight`` (kept ones are left untouched);
    * all other elements lose attributes listed in ``ignored_attributes``
      or whose local name starts with ``rsid``.

    Returns the same *tree* object.
    """
    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() returns a list, so detaching nodes inside the loop is safe.
    for node in tree.xpath(".//*"):
        local_name = node.tag.split('}')[-1]

        if local_name in ignored_elements:
            node.getparent().remove(node)
            continue

        if node.tag == run_props_tag:
            # Run properties survive only when they carry a highlight.
            keeps_highlight = any(kid.tag.endswith('highlight') for kid in node)
            if not keeps_highlight:
                node.getparent().remove(node)
            continue

        for qualified_name in list(node.attrib):
            bare_name = qualified_name.split('}')[-1]
            if bare_name in ignored_attributes or bare_name.startswith('rsid'):
                del node.attrib[qualified_name]

    return tree
84
+
85
def etree_to_dict(t):
    """Convert an element (and its subtree) to a nested dictionary.

    Namespaces are stripped from tag and attribute names.  Elements named in
    ``ignored_elements`` and non-element nodes (comments, processing
    instructions) are skipped.

    Args:
        t: an lxml element.

    Returns:
        ``{tag: value}`` or ``None`` when the element is skipped or has no
        attributes, no element children and no text.  ``value`` is:

        * the stripped text, for a leaf without attributes;
        * otherwise a dict holding the converted children (repeated child
          tags are collected into a list), the retained attributes, and the
          stripped text under ``'#text'`` when it is non-empty.
    """
    # Comments and processing instructions expose a callable as ``tag``
    # instead of a string; they carry no document structure, so skip them
    # rather than failing on ``.split``.
    if not isinstance(t.tag, str):
        return None

    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    d = {tag: {} if t.attrib else None}
    children = [child for child in t if isinstance(child.tag, str)]
    if children:
        dd = defaultdict(list)
        for dc in filter(None, map(etree_to_dict, children)):
            for k, v in dc.items():
                dd[k].append(v)
        # A tag that occurs once maps to its value, otherwise to a list.
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}

    if t.attrib:
        # Filter out common fonts and ignored attributes
        filtered_attribs = {}
        for k, v in t.attrib.items():
            k = k.split('}')[-1]  # Remove namespace URI
            if k in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                if v not in common_fonts:
                    filtered_attribs[k] = v
            elif k not in ignored_attributes and not k.startswith('rsid'):
                filtered_attribs[k] = v
        d[tag].update(filtered_attribs)

    if t.text:
        # The text is already a ``str``; no re-encoding is needed.
        text = t.text.strip()
        if children or t.attrib:
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text

    if not t.attrib and not children and not t.text:
        return None

    return d
126
+
127
+ # Additionally, update the 'remove_ignored_elements' function to fix encoding
128
def remove_ignored_elements(tree):
    """Prune ignored elements/attributes from *tree* and strip element text.

    Works in place on every descendant element of *tree*:

    * elements whose local name is in ``ignored_elements`` are detached;
    * ``w:rPr`` elements are detached unless they have a child whose tag
      ends with ``highlight`` (kept ones are left untouched);
    * all other elements lose attributes listed in ``ignored_attributes``
      or whose local name starts with ``rsid``.

    Afterwards the leading text of each element is whitespace-stripped.

    Args:
        tree: root lxml element of the document.

    Returns:
        The same *tree* object.
    """
    run_props_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() returns a list, so detaching nodes inside the loop is safe.
    for elem in tree.xpath(".//*"):
        tag_without_ns = elem.tag.split('}')[-1]
        if tag_without_ns in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == run_props_tag:  # Check for highlights in rPr
            if not any(child.tag.endswith('highlight') for child in elem):
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_without_ns = attr.split('}')[-1]
                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
                    del elem.attrib[attr]

    # ".//text()" yields both an element's leading text and tail text.  For a
    # tail, getparent() is the element the tail *follows*, so writing it to
    # ``parent.text`` would overwrite that element's real content (e.g.
    # indentation after ``<w:t>Hello</w:t>`` would blank out "Hello").
    # Only leading-text nodes are normalised; tails are left alone.
    for text_node in tree.xpath(".//text()"):
        if not text_node.is_text:
            continue
        parent = text_node.getparent()
        if parent is not None:
            parent.text = text_node.strip()
    return tree
151
+
152
def extract_metadata(docx):
    """Extract metadata from the document's core properties.

    Args:
        docx: an open ``zipfile.ZipFile`` for the .docx package.

    Returns:
        A dict mapping each child element's local tag name in
        ``docProps/core.xml`` to its text, leaving out names listed in
        ``ignored_metadata_elements``.  An empty dict is returned when the
        package has no ``docProps/core.xml`` part.
    """
    try:
        xml_content = docx.read('docProps/core.xml')
    except KeyError:
        # ZipFile raises KeyError for a missing member; a document without
        # core properties simply has no metadata to report.
        return {}

    # NOTE(review): the names in ignored_metadata_elements look like
    # extended (app.xml) properties -- confirm they are meant for core.xml.
    metadata = {}
    for child in etree.XML(xml_content):
        if not isinstance(child.tag, str):
            continue  # comment / processing instruction, not a property
        tag = child.tag.split('}')[-1]  # Get tag without namespace
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
163
+
164
def process_docx(file_path):
    """Render the .docx file at *file_path* as a JSON string.

    The package metadata is read first, then ``word/document.xml`` is parsed,
    pruned with ``remove_ignored_elements`` and converted with
    ``etree_to_dict``.  The metadata is stored under the ``'metadata'`` key
    of the resulting dictionary, which is serialised with two-space
    indentation and without ASCII escaping.
    """
    with zipfile.ZipFile(file_path) as archive:
        doc_metadata = extract_metadata(archive)
        raw_document = archive.read('word/document.xml')

    # Strip noise from the parsed tree before turning it into plain dicts.
    root = remove_ignored_elements(etree.XML(raw_document))

    as_dict = etree_to_dict(root)
    as_dict['metadata'] = doc_metadata

    return json.dumps(as_dict, ensure_ascii=False, indent=2)
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  gradio
2
- openai >= 1.0.0
 
 
1
  gradio
2
+ openai >= 1.0.0
3
+ lxml