File size: 823 Bytes
07e0b4e
 
 
 
 
 
 
 
b756001
 
07e0b4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import io

import fitz  # PyMuPDF
import gradio as gr

def pdf_to_markdown(pdf_file):
    """Extract the text of every page of a PDF into one page-delimited string.

    Args:
        pdf_file: The PDF to read. Accepted forms:

            * ``None`` -- nothing was uploaded;
            * ``bytes`` / ``bytearray`` / ``memoryview`` -- the raw PDF data;
            * any object with a ``read()`` method that returns the PDF data;
            * anything else is treated as a filesystem path (via ``str()``).

    Returns:
        ``"No file uploaded."`` when ``pdf_file`` is ``None``. Otherwise one
        section per page, each made of a ``=== Page N`` header (1-based), a
        blank line and the page's stripped text; sections are separated by a
        blank line.
    """
    if pdf_file is None:
        return "No file uploaded."

    # Work out how to hand the input to PyMuPDF: in-memory data goes through
    # ``stream``, everything else is opened by file name.
    if isinstance(pdf_file, (bytes, bytearray, memoryview)):
        open_kwargs = {"stream": bytes(pdf_file)}
    elif hasattr(pdf_file, "read"):
        open_kwargs = {"stream": pdf_file.read()}
    else:
        open_kwargs = {"filename": str(pdf_file)}

    # The context manager closes the document even if extraction raises, so
    # the underlying handle is not leaked across calls.
    with fitz.open(filetype="pdf", **open_kwargs) as doc:
        sections = [
            f"=== Page {number}\n\n{page.get_text('text').strip()}"
            for number, page in enumerate(doc, start=1)
        ]

    return "\n\n".join(sections)

# Gradio Interface
def _extract_from_upload(pdf_bytes):
    """Pass an uploaded PDF (raw bytes, or None) on to pdf_to_markdown.

    The bytes are wrapped in an in-memory stream so pdf_to_markdown receives
    an object with a ``read()`` method; ``None`` (no upload) is forwarded
    unchanged so its "no file" message is still returned.
    """
    if pdf_bytes is None:
        return pdf_to_markdown(None)
    return pdf_to_markdown(io.BytesIO(pdf_bytes))


iface = gr.Interface(
    fn=_extract_from_upload,
    # type="file" is rejected by Gradio 4+ (only "filepath" and "binary" are
    # valid there); "binary" delivers the upload as bytes.
    inputs=gr.File(type="binary"),
    outputs=gr.Textbox(label="Markdown Output", lines=15),
    title="PDF to Markdown Extractor",
    description="Upload a PDF and get a copyable markdown output.",
)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()