Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed 5 days ago

Commit

e9af7f8

verified ·

1 Parent(s): 4366a57

Changes output to HTML instead of EPUB (for ease of formatting, etc.)

Browse files

Files changed (1) hide show

app.py +37 -36

app.py CHANGED Viewed

@@ -25,17 +25,13 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-def process_pdf_to_epub(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
-    book = epub.EpubBook()
-    book.set_identifier("id123456")
-    book.set_title(title)
-    book.add_author(author)
     all_text = ""
     for i in range(num_pages):
         page_num = i + 1
@@ -92,7 +88,6 @@ def process_pdf_to_epub(pdf_file, title, author):
                     raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
                     try:
                         parsed = json.loads(raw_output)
-                        # Only include `natural_text`, drop undesired metadata
                         decoded = parsed.get("natural_text", raw_output)
                     except json.JSONDecodeError:
                         decoded = raw_output
@@ -106,59 +101,65 @@ def process_pdf_to_epub(pdf_file, title, author):
         print(f"Decoded content for page {page_num}: {decoded}")
-        # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
-        converted = convert_inline_and_block_latex_to_mathml(decoded)
-        converted = converted.replace("\n", "<br>")  # Optional: preserve line breaks
-        all_text += f"<div>{converted}</div>"
         if page_num == 1:
-            cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
-            cover_io = BytesIO()
-            cover_image.save(cover_io, format='PNG')
-            book.set_cover("cover.png", cover_io.getvalue())
-    single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
     mathjax_script = """
     <script type="text/javascript" id="MathJax-script" async
       src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
     </script>
     """
-    single_chapter.content = f"""<!DOCTYPE html>
     <html>
-      <head>
-        <meta charset="utf-8"/>
         <title>{html.escape(title)}</title>
         {mathjax_script}
-      </head>
-      <body>
         <h1>{html.escape(title)}</h1>
         {all_text}
-      </body>
     </html>
     """
-    book.add_item(single_chapter)
-    book.toc = (single_chapter,)
-    book.spine = ['nav', single_chapter]
-    book.add_item(epub.EpubNcx())
-    book.add_item(epub.EpubNav())
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
-        epub.write_epub(tmp.name, book)
         return tmp.name
 # Gradio Interface
 iface = gr.Interface(
-    fn=process_pdf_to_epub,
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
-        gr.Textbox(label="EPUB Title"),
         gr.Textbox(label="Author(s)")
     ],
-    outputs=gr.File(label="Download EPUB"),
-    title="PDF to EPUB Converter (with olmOCR)",
-    description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
     allow_flagging="never"
 )

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
     all_text = ""
+    cover_img_html = ""
     for i in range(num_pages):
         page_num = i + 1
                     raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
                     try:
                         parsed = json.loads(raw_output)
                         decoded = parsed.get("natural_text", raw_output)
                     except json.JSONDecodeError:
                         decoded = raw_output
         print(f"Decoded content for page {page_num}: {decoded}")
+        from latex2mathml.converter import convert as latex_to_mathml
+        def convert_latex(text):
+            import re
+            def replacer(match):
+                try:
+                    return f"<math>{latex_to_mathml(match.group(1))}</math>"
+                except:
+                    return html.escape(match.group(0))
+            # Convert \( ... \)
+            text = re.sub(r'\\\((.*?)\\\)', replacer, text)
+            # Convert \[ ... \]
+            text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
+            return text
+        safe_html = html.escape(decoded).replace("\n", "<br>")
+        mathml_html = convert_latex(safe_html)
+        all_text += f"<div>{mathml_html}</div>\n"
         if page_num == 1:
+            cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
     mathjax_script = """
     <script type="text/javascript" id="MathJax-script" async
       src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
     </script>
     """
+    full_html = f"""<!DOCTYPE html>
     <html>
+    <head>
+        <meta charset="utf-8">
         <title>{html.escape(title)}</title>
         {mathjax_script}
+    </head>
+    <body>
         <h1>{html.escape(title)}</h1>
+        <h3>{html.escape(author)}</h3>
+        {cover_img_html}
         {all_text}
+    </body>
     </html>
     """
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
+        tmp.write(full_html)
         return tmp.name
 # Gradio Interface
 iface = gr.Interface(
+    fn=process_pdf_to_html,  # NEW FUNCTION
     inputs=[
         gr.File(label="Upload PDF", file_types=[".pdf"]),
+        gr.Textbox(label="HTML Title"),
         gr.Textbox(label="Author(s)")
     ],
+    outputs=gr.File(label="Download HTML"),
+    title="PDF to HTML Converter (for Calibre/Kindle)",
+    description="Uploads a PDF, extracts text via vision+prompt, embeds it in a styled HTML file with math support. Ready for Calibre.",
     allow_flagging="never"
 )