File size: 1,433 Bytes
c8a32e7
 
 
 
 
 
 
 
 
 
 
 
 
 
9ddf80f
3cb2b29
c8a32e7
3cb2b29
c8a32e7
 
 
9ddf80f
c8a32e7
3cb2b29
c8a32e7
 
 
 
 
77d131e
 
 
c8a32e7
 
 
5737a5c
c8a32e7
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import base64
import io
import os

import gradio as gr
from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.settings import Settings


# Load the marker models once, when this module is imported; the resulting
# list is passed to convert_single_pdf on every conversion request.
model_list = load_all_models()

def parse_pdf_and_return_markdown(pdf_file: str, extract_images: bool):
    """Convert a PDF to markdown and optionally return its images as base64.

    Args:
        pdf_file: Path to the PDF to convert. The Gradio ``File`` input that
            feeds this function is configured with ``type="filepath"``, so a
            path string (not raw bytes) is received.
        extract_images: When true, every image produced by the conversion is
            encoded as a base64 PNG string; when false, no images are returned.

    Returns:
        A ``(full_text, out_meta, image_data)`` tuple. ``full_text`` and
        ``out_meta`` are passed through unchanged from ``convert_single_pdf``;
        ``image_data`` maps each image name to its base64-encoded PNG bytes
        (an empty dict when ``extract_images`` is false).
    """
    full_text, images, out_meta = convert_single_pdf(pdf_file, model_list)
    image_data = {}
    if extract_images:
        for filename, image in images.items():
            # Encode in memory rather than round-tripping through a file in
            # the working directory: concurrent requests cannot clobber each
            # other's files, nothing is left behind if encoding fails, and a
            # name coming from the PDF is never used as a filesystem path.
            with io.BytesIO() as buffer:
                image.save(buffer, "PNG")
                image_bytes = buffer.getvalue()

            image_data[filename] = base64.b64encode(image_bytes).decode('utf-8')

    return full_text, out_meta, image_data
    

with gr.Blocks() as server:
    # Page heading and usage notes rendered above the conversion form.
    gr.Markdown("# Marker: A PDF to Markdown Converter")
    gr.Markdown("This is a tool that converts a PDF file to markdown. It uses a combination of OCR and NLP to extract text and images from the PDF.")
    gr.Markdown("The images are returned as base64 encoded strings. You can use PIL to convert them back to images.")

    # Form inputs: the uploaded PDF (delivered as a file path) and the
    # image-extraction toggle.
    pdf_upload = gr.File(label="Upload PDF", type="filepath")
    images_toggle = gr.Checkbox(label="Extract Images")

    # Form outputs, in the order the handler returns them:
    # markdown text, metadata, base64 images.
    markdown_box = gr.Text(label="Markdown")
    metadata_view = gr.JSON(label="Metadata")
    images_view = gr.JSON(label="Images")

    gr.Interface(
        fn=parse_pdf_and_return_markdown,
        inputs=[pdf_upload, images_toggle],
        outputs=[markdown_box, metadata_view, images_view],
    )


# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    server.launch()