leonarb committed on
Commit
e9af7f8
·
verified ·
1 Parent(s): 4366a57

Changes output to HTML instead of EPUB (for ease of formatting, etc.)

Browse files
Files changed (1) hide show
  1. app.py +37 -36
app.py CHANGED
@@ -25,17 +25,13 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
25
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  model.to(device)
27
 
28
- def process_pdf_to_epub(pdf_file, title, author):
29
  pdf_path = pdf_file.name
30
  doc = fitz.open(pdf_path)
31
  num_pages = len(doc)
32
 
33
- book = epub.EpubBook()
34
- book.set_identifier("id123456")
35
- book.set_title(title)
36
- book.add_author(author)
37
-
38
  all_text = ""
 
39
 
40
  for i in range(num_pages):
41
  page_num = i + 1
@@ -92,7 +88,6 @@ def process_pdf_to_epub(pdf_file, title, author):
92
  raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
93
  try:
94
  parsed = json.loads(raw_output)
95
- # Only include `natural_text`, drop undesired metadata
96
  decoded = parsed.get("natural_text", raw_output)
97
  except json.JSONDecodeError:
98
  decoded = raw_output
@@ -106,59 +101,65 @@ def process_pdf_to_epub(pdf_file, title, author):
106
 
107
  print(f"Decoded content for page {page_num}: {decoded}")
108
 
109
- # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
110
- converted = convert_inline_and_block_latex_to_mathml(decoded)
111
- converted = converted.replace("\n", "<br>") # Optional: preserve line breaks
112
- all_text += f"<div>{converted}</div>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  if page_num == 1:
115
- cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
116
- cover_io = BytesIO()
117
- cover_image.save(cover_io, format='PNG')
118
- book.set_cover("cover.png", cover_io.getvalue())
119
 
120
- single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
121
  mathjax_script = """
122
  <script type="text/javascript" id="MathJax-script" async
123
  src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
124
  </script>
125
  """
126
-
127
- single_chapter.content = f"""<!DOCTYPE html>
128
  <html>
129
- <head>
130
- <meta charset="utf-8"/>
131
  <title>{html.escape(title)}</title>
132
  {mathjax_script}
133
- </head>
134
- <body>
135
  <h1>{html.escape(title)}</h1>
 
 
136
  {all_text}
137
- </body>
138
  </html>
139
  """
140
 
141
- book.add_item(single_chapter)
142
- book.toc = (single_chapter,)
143
- book.spine = ['nav', single_chapter]
144
- book.add_item(epub.EpubNcx())
145
- book.add_item(epub.EpubNav())
146
-
147
- with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
148
- epub.write_epub(tmp.name, book)
149
  return tmp.name
150
 
151
  # Gradio Interface
152
  iface = gr.Interface(
153
- fn=process_pdf_to_epub,
154
  inputs=[
155
  gr.File(label="Upload PDF", file_types=[".pdf"]),
156
- gr.Textbox(label="EPUB Title"),
157
  gr.Textbox(label="Author(s)")
158
  ],
159
- outputs=gr.File(label="Download EPUB"),
160
- title="PDF to EPUB Converter (with olmOCR)",
161
- description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
162
  allow_flagging="never"
163
  )
164
 
 
25
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  model.to(device)
27
 
28
+ def process_pdf_to_html(pdf_file, title, author):
29
  pdf_path = pdf_file.name
30
  doc = fitz.open(pdf_path)
31
  num_pages = len(doc)
32
 
 
 
 
 
 
33
  all_text = ""
34
+ cover_img_html = ""
35
 
36
  for i in range(num_pages):
37
  page_num = i + 1
 
88
  raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
89
  try:
90
  parsed = json.loads(raw_output)
 
91
  decoded = parsed.get("natural_text", raw_output)
92
  except json.JSONDecodeError:
93
  decoded = raw_output
 
101
 
102
  print(f"Decoded content for page {page_num}: {decoded}")
103
 
104
+ from latex2mathml.converter import convert as latex_to_mathml
105
+
106
+ def convert_latex(text):
107
+ import re
108
+ def replacer(match):
109
+ try:
110
+ return f"<math>{latex_to_mathml(match.group(1))}</math>"
111
+ except:
112
+ return html.escape(match.group(0))
113
+ # Convert \( ... \)
114
+ text = re.sub(r'\\\((.*?)\\\)', replacer, text)
115
+ # Convert \[ ... \]
116
+ text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
117
+ return text
118
+
119
+ safe_html = html.escape(decoded).replace("\n", "<br>")
120
+ mathml_html = convert_latex(safe_html)
121
+ all_text += f"<div>{mathml_html}</div>\n"
122
 
123
  if page_num == 1:
124
+ cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
 
 
 
125
 
 
126
  mathjax_script = """
127
  <script type="text/javascript" id="MathJax-script" async
128
  src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
129
  </script>
130
  """
131
+
132
+ full_html = f"""<!DOCTYPE html>
133
  <html>
134
+ <head>
135
+ <meta charset="utf-8">
136
  <title>{html.escape(title)}</title>
137
  {mathjax_script}
138
+ </head>
139
+ <body>
140
  <h1>{html.escape(title)}</h1>
141
+ <h3>{html.escape(author)}</h3>
142
+ {cover_img_html}
143
  {all_text}
144
+ </body>
145
  </html>
146
  """
147
 
148
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
149
+ tmp.write(full_html)
 
 
 
 
 
 
150
  return tmp.name
151
 
152
  # Gradio Interface
153
  iface = gr.Interface(
154
+ fn=process_pdf_to_html, # NEW FUNCTION
155
  inputs=[
156
  gr.File(label="Upload PDF", file_types=[".pdf"]),
157
+ gr.Textbox(label="HTML Title"),
158
  gr.Textbox(label="Author(s)")
159
  ],
160
+ outputs=gr.File(label="Download HTML"),
161
+ title="PDF to HTML Converter (for Calibre/Kindle)",
162
+ description="Uploads a PDF, extracts text via vision+prompt, embeds it in a styled HTML file with math support. Ready for Calibre.",
163
  allow_flagging="never"
164
  )
165