Sanshruth commited on
Commit
5ff5737
·
verified ·
1 Parent(s): 41c4568

initial_commit

Browse files
Files changed (7) hide show
  1. .gitattributes +2 -0
  2. Fast_Sans_Bionic.ttf +3 -0
  3. Fast_Sans_Dotted.ttf +3 -0
  4. Fast_Serif.ttf +0 -0
  5. app.py +316 -0
  6. image.jpeg +0 -0
  7. requirements.txt +4 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Fast_Sans_Bionic.ttf filter=lfs diff=lfs merge=lfs -text
37
+ Fast_Sans_Dotted.ttf filter=lfs diff=lfs merge=lfs -text
Fast_Sans_Bionic.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e721ed03e58b0a7a964d425d4e851d421fb30ac1bf4c5f6501f339a121b97b6e
3
+ size 1868664
Fast_Sans_Dotted.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8299839ea5626ca69924d9e60d11a6bcb22eee97621b11f9b9005c8745631454
3
+ size 1738812
Fast_Serif.ttf ADDED
Binary file (930 kB). View file
 
app.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pdf2docx import Converter
3
+ from docx import Document
4
+ import os
5
+ import glob
6
+ import base64
7
+ from docx.shared import Inches, Pt
8
+ from docx.oxml import OxmlElement
9
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
10
+ import xml.etree.ElementTree as ET
11
+
12
+ def find_ttf_fonts():
13
+ files = glob.glob('**/*.ttf', recursive=True)
14
+
15
+ def embed_font_in_html(font_path, font_name, html_content):
16
+ with open(font_path, "rb") as font_file:
17
+ font_data = font_file.read()
18
+ encoded_font = base64.b64encode(font_data).decode('utf-8')
19
+
20
+ font_style = f"""
21
+ <style>
22
+ @font-face {{
23
+ font-family: '{font_name}';
24
+ src: url(data:font/ttf;base64,{encoded_font}) format('truetype');
25
+ }}
26
+ body {{
27
+ font-family: '{font_name}', Arial, sans-serif;
28
+ margin: 0;
29
+ padding: 0;
30
+ background-color: white;
31
+ }}
32
+ .page {{
33
+ position: relative;
34
+ width: 8.5in;
35
+ margin: 20px auto;
36
+ padding: 20px;
37
+ box-sizing: border-box;
38
+ background-color: white;
39
+ box-shadow: 0 0 10px rgba(0,0,0,0.1);
40
+ }}
41
+ .paragraph {{
42
+ margin: 0;
43
+ padding: 0;
44
+ position: relative;
45
+ }}
46
+ .image-container {{
47
+ display: inline-block;
48
+ position: relative;
49
+ vertical-align: middle;
50
+ }}
51
+ img {{
52
+ max-width: 100%;
53
+ height: auto;
54
+ display: inline-block;
55
+ vertical-align: middle;
56
+ }}
57
+ table {{
58
+ border-collapse: collapse;
59
+ width: 100%;
60
+ margin: 10px 0;
61
+ }}
62
+ td, th {{
63
+ border: 1px solid black;
64
+ padding: 8px;
65
+ position: relative;
66
+ }}
67
+ </style>
68
+ """
69
+ return font_style + html_content
70
+
71
+ def extract_images_from_doc(doc):
72
+ images = {}
73
+ for rel in doc.part.rels.values():
74
+ if "image" in rel.reltype:
75
+ try:
76
+ image_data = rel.target_part.blob
77
+ image_type = rel.target_part.content_type.split('/')[-1]
78
+ if image_type.lower() not in ['jpeg', 'jpg', 'png', 'gif']:
79
+ image_type = 'png'
80
+ encoded_image = base64.b64encode(image_data).decode('utf-8')
81
+ images[rel.rId] = f"data:image/{image_type};base64,{encoded_image}"
82
+ except Exception as e:
83
+ print(f"Error processing image: {str(e)}")
84
+ continue
85
+ return images
86
+
87
+ def get_image_position(element):
88
+ try:
89
+ anchor = element.find('.//wp:anchor',
90
+ {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
91
+ if anchor is not None:
92
+ pos_h = anchor.find('.//wp:positionH',
93
+ {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
94
+ pos_v = anchor.find('.//wp:positionV',
95
+ {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
96
+
97
+ if pos_h is not None and pos_v is not None:
98
+ x = pos_h.find('.//wp:posOffset',
99
+ {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
100
+ y = pos_v.find('.//wp:posOffset',
101
+ {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
102
+
103
+ if x is not None and y is not None:
104
+ return {
105
+ 'x': int(x.text) / 914400,
106
+ 'y': int(y.text) / 914400
107
+ }
108
+ except Exception:
109
+ pass
110
+ return None
111
+
112
+ def process_paragraph(paragraph, images_dict):
113
+ html_content = '<div class="paragraph">'
114
+
115
+ if paragraph.alignment == WD_ALIGN_PARAGRAPH.CENTER:
116
+ html_content += '<div style="text-align: center;">'
117
+ elif paragraph.alignment == WD_ALIGN_PARAGRAPH.RIGHT:
118
+ html_content += '<div style="text-align: right;">'
119
+ else:
120
+ html_content += '<div>'
121
+
122
+ for run in paragraph.runs:
123
+ style = []
124
+ if run.bold: style.append('font-weight: bold')
125
+ if run.italic: style.append('font-style: italic')
126
+ if run.underline: style.append('text-decoration: underline')
127
+ if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
128
+
129
+ drawing_elements = run._element.findall('.//w:drawing',
130
+ {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
131
+
132
+ for drawing in drawing_elements:
133
+ blip = drawing.find('.//a:blip',
134
+ {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
135
+ if blip is not None:
136
+ image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
137
+ if image_rel_id in images_dict:
138
+ position = get_image_position(drawing)
139
+ if position:
140
+ style_pos = f"position: absolute; left: {position['x']}in; top: {position['y']}in;"
141
+ html_content += f'<div class="image-container" style="{style_pos}">'
142
+ html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
143
+ html_content += '</div>'
144
+ else:
145
+ html_content += f'<div class="image-container">'
146
+ html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
147
+ html_content += '</div>'
148
+
149
+ style_str = '; '.join(style)
150
+ if run.text.strip():
151
+ html_content += f'<span style="{style_str}">{run.text}</span>'
152
+
153
+ html_content += '</div></div>'
154
+ return html_content
155
+
156
+ def process_table(table, images_dict):
157
+ html_content = '<table>'
158
+ for row in table.rows:
159
+ html_content += '<tr>'
160
+ for cell in row.cells:
161
+ html_content += '<td>'
162
+ for paragraph in cell.paragraphs:
163
+ for run in paragraph.runs:
164
+ style = []
165
+ if run.bold: style.append('font-weight: bold')
166
+ if run.italic: style.append('font-style: italic')
167
+ if run.underline: style.append('text-decoration: underline')
168
+ if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
169
+
170
+ drawing_elements = run._element.findall('.//w:drawing',
171
+ {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
172
+
173
+ for drawing in drawing_elements:
174
+ blip = drawing.find('.//a:blip',
175
+ {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
176
+ if blip is not None:
177
+ image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
178
+ if image_rel_id in images_dict:
179
+ html_content += f'<div class="image-container">'
180
+ html_content += f'<img src="{images_dict[image_rel_id]}" alt="Table Cell Image"/>'
181
+ html_content += '</div>'
182
+
183
+ style_str = '; '.join(style)
184
+ if run.text.strip():
185
+ html_content += f'<span style="{style_str}">{run.text}</span>'
186
+ html_content += '</td>'
187
+ html_content += '</tr>'
188
+ html_content += '</table>'
189
+ return html_content
190
+
191
+ def pdf_to_html(pdf_file, font_name):
192
+ if not pdf_file:
193
+ return None
194
+
195
+ try:
196
+ docx_filename = pdf_file.name.replace('.pdf', '.docx')
197
+ cv = Converter(pdf_file.name)
198
+ cv.convert(docx_filename)
199
+ cv.close()
200
+
201
+ doc = Document(docx_filename)
202
+ images_dict = extract_images_from_doc(doc)
203
+
204
+ html_content = """<!DOCTYPE html>
205
+ <html>
206
+ <head>
207
+ <meta charset='utf-8'>
208
+ <title>Converted Document</title>
209
+ </head>
210
+ <body>
211
+ <div class="page">
212
+ """
213
+
214
+ paragraph_map = {}
215
+ current_paragraph_index = 0
216
+ for para in doc.paragraphs:
217
+ paragraph_map[para._element] = current_paragraph_index
218
+ current_paragraph_index += 1
219
+
220
+ for element in doc.element.body:
221
+ if element.tag.endswith('p'):
222
+ if element in paragraph_map:
223
+ paragraph = doc.paragraphs[paragraph_map[element]]
224
+ html_content += process_paragraph(paragraph, images_dict)
225
+ elif element.tag.endswith('tbl'):
226
+ table_index = len([e for e in doc.element.body[:doc.element.body.index(element)]
227
+ if e.tag.endswith('tbl')])
228
+ html_content += process_table(doc.tables[table_index], images_dict)
229
+
230
+ html_content += "</div></body></html>"
231
+
232
+ ttf_files = {os.path.basename(f): f for f in find_ttf_fonts()}
233
+ if font_name in ttf_files:
234
+ font_path = ttf_files[font_name]
235
+ font_name_clean = os.path.splitext(font_name)[0]
236
+ html_content = embed_font_in_html(font_path, font_name_clean, html_content)
237
+
238
+ html_filename = "output_with_font.html"
239
+ with open(html_filename, "w", encoding="utf-8") as html_file:
240
+ html_file.write(html_content)
241
+
242
+ os.remove(docx_filename)
243
+ return html_filename
244
+
245
+ except Exception as e:
246
+ print(f"Error in pdf_to_html: {str(e)}")
247
+ return None
248
+
249
+ # Gradio Interface
250
+ with gr.Blocks(theme=gr.themes.Soft()) as app:
251
+ gr.Markdown("# Bionic Reading PDF Converter")
252
+
253
+ with gr.Row():
254
+ gr.Image("image.jpeg",
255
+ label="Bionic Reading Example",
256
+ show_label=False,
257
+ width=400,
258
+ height=300)
259
+
260
+
261
+ with gr.Row():
262
+ with gr.Column(scale=2):
263
+ pdf_input = gr.File(
264
+ label="Upload Your PDF",
265
+ file_types=[".pdf"],
266
+ file_count="single"
267
+ )
268
+
269
+ ttf_files = find_ttf_fonts()
270
+ font_dropdown = gr.Dropdown(
271
+ [os.path.basename(font) for font in ttf_files],
272
+ label="Select Font Style",
273
+ value=os.path.basename(ttf_files[0]) if ttf_files else None,
274
+ info="Choose your preferred reading font"
275
+ )
276
+
277
+ convert_pdf_to_html = gr.Button(
278
+ "Convert to Bionic Format",
279
+ variant="primary",
280
+ size="lg"
281
+ )
282
+
283
+ font_output = gr.File(
284
+ label="Download Enhanced HTML File",
285
+ type="filepath"
286
+ )
287
+
288
+ with gr.Row():
289
+ example_files = [
290
+ os.path.join("examples", f)
291
+ for f in os.listdir("examples")
292
+ if f.endswith('.pdf')
293
+ ] if os.path.exists("examples") else []
294
+
295
+ if example_files:
296
+ gr.Examples(
297
+ example_files,
298
+ pdf_input,
299
+ label="Sample PDFs"
300
+ )
301
+
302
+ with gr.Row():
303
+ gr.Markdown(
304
+ """
305
+ ---
306
+ 📝 Best results with text-based PDFs (not scanned documents)
307
+ """
308
+ )
309
+
310
+ convert_pdf_to_html.click(
311
+ pdf_to_html,
312
+ inputs=[pdf_input, font_dropdown],
313
+ outputs=[font_output]
314
+ )
315
+
316
+ app.launch(debug=True)
image.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pdf2docx
2
+ python-docx
3
+ fpdf
4
+ gradio