File size: 6,094 Bytes
2e8fc61
 
 
37e70be
2e8fc61
37e70be
2e8fc61
 
 
 
 
 
37e70be
 
 
 
 
2e8fc61
37e70be
2e8fc61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37e70be
2e8fc61
 
37e70be
 
 
2e8fc61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import os
from typing import Dict, Iterator, Optional, Tuple

import gradio as gr
import torch.cuda
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorDevice
from docling.datamodel.pipeline_options import smolvlm_picture_description
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils import model_downloader
from docling_core.types import DoclingDocument

# Module-level PDF pipeline configuration; parse_document() updates its
# flags on every call before building a converter from it.
pipeline_options = PdfPipelineOptions()

# Switch the accelerator to CUDA (with flash-attention 2) when torch reports
# that a CUDA device is available.
if torch.cuda.is_available():
    print("Enabling CUDA Accelerator")
    pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
    pipeline_options.accelerator_options.cuda_use_flash_attention2 = True

# Download the models at start-up, but only when the IS_HF_SPACE environment
# variable is set to a non-empty value.
if os.environ.get("IS_HF_SPACE"):
    print("Downloading models...")
    model_downloader.download_models()


def parse_document(
    file_path: str,
    do_code_enrichment: bool,
    do_formula_enrichment: bool,
    do_picture_classification: bool,
    do_picture_description: bool,
) -> "Iterator[Tuple[Optional[DoclingDocument], str]]":
    """Convert a file with Docling, yielding progress as it goes.

    This is a generator. It yields ``(document, status)`` pairs: first a
    placeholder pair with ``document`` set to ``None`` while the conversion
    runs, then the converted document together with a completion message.

    NOTE: the function reconfigures the module-level ``pipeline_options``
    object in place, so the chosen flags are shared by every call.

    Args:
        file_path: Path of the file to convert. An empty value (``None`` or
            ``""``) is reported through the status message instead of being
            passed to the converter.
        do_code_enrichment: Value for ``pipeline_options.do_code_enrichment``.
        do_formula_enrichment: Value for
            ``pipeline_options.do_formula_enrichment``.
        do_picture_classification: Value for
            ``pipeline_options.do_picture_classification``.
        do_picture_description: Value for
            ``pipeline_options.do_picture_description``.

    Yields:
        ``(None, status)`` while there is no document to return, then
        ``(result.document, "Done ✅")`` once the conversion has finished.
    """
    # Nothing to convert: say so rather than handing an empty path to the
    # converter.
    if not file_path:
        yield None, "No file provided. Please upload a document first ❌"
        return

    yield None, "Parsing document... ⏳"

    pipeline_options.do_code_enrichment = do_code_enrichment
    pipeline_options.do_formula_enrichment = do_formula_enrichment
    pipeline_options.do_picture_classification = do_picture_classification

    pipeline_options.do_picture_description = do_picture_description
    pipeline_options.picture_description_options = smolvlm_picture_description
    pipeline_options.picture_description_options.prompt = (
        "Describe the image in three sentences. Be concise and accurate."
    )

    # Picture images are always generated, at scale 2.0, regardless of which
    # picture features are enabled (these were the values that took effect
    # before the redundant earlier assignments were removed).
    pipeline_options.images_scale = 2.0
    pipeline_options.generate_picture_images = True

    print(f"Pipeline options defined: \n\t{pipeline_options}")
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    result = converter.convert(file_path)

    yield result.document, "Done ✅"


def to_html(docling_doc: DoclingDocument) -> str:
    """Export a parsed document as HTML.

    Args:
        docling_doc: The document to export. May be ``None``, which is the
            value ``parse_document`` yields before a conversion has finished.

    Returns:
        Whatever ``docling_doc.export_to_html()`` returns.

    Raises:
        gr.Error: If ``docling_doc`` is ``None``.
    """
    # Surface a readable message instead of an AttributeError on None.
    if docling_doc is None:
        raise gr.Error("No parsed document available. Parse a document first.")
    return docling_doc.export_to_html()


def to_markdown(docling_doc: DoclingDocument) -> str:
    """Export a parsed document as Markdown.

    Args:
        docling_doc: The document to export. May be ``None``, which is the
            value ``parse_document`` yields before a conversion has finished.

    Returns:
        Whatever ``docling_doc.export_to_markdown()`` returns.

    Raises:
        gr.Error: If ``docling_doc`` is ``None``.
    """
    # Surface a readable message instead of an AttributeError on None.
    if docling_doc is None:
        raise gr.Error("No parsed document available. Parse a document first.")
    return docling_doc.export_to_markdown()


def to_json(docling_doc: DoclingDocument) -> Dict:
    """Export a parsed document as a dictionary.

    Despite the name, the return value is the dictionary produced by
    ``export_to_dict()``, not a serialized JSON string.
    NOTE(review): this result is routed to a text output component; confirm
    that the rendered form is acceptable or serialize it explicitly.

    Args:
        docling_doc: The document to export. May be ``None``, which is the
            value ``parse_document`` yields before a conversion has finished.

    Returns:
        Whatever ``docling_doc.export_to_dict()`` returns.

    Raises:
        gr.Error: If ``docling_doc`` is ``None``.
    """
    # Surface a readable message instead of an AttributeError on None.
    if docling_doc is None:
        raise gr.Error("No parsed document available. Parse a document first.")
    return docling_doc.export_to_dict()


def to_text(docling_doc: DoclingDocument) -> str:
    """Export a parsed document as plain text.

    Args:
        docling_doc: The document to export. May be ``None``, which is the
            value ``parse_document`` yields before a conversion has finished.

    Returns:
        Whatever ``docling_doc.export_to_text()`` returns.

    Raises:
        gr.Error: If ``docling_doc`` is ``None``.
    """
    # Surface a readable message instead of an AttributeError on None.
    if docling_doc is None:
        raise gr.Error("No parsed document available. Parse a document first.")
    return docling_doc.export_to_text()


def upload_file(file) -> str:
    """Return the ``name`` attribute of the given file object.

    Args:
        file: Any object exposing a ``name`` attribute.

    Returns:
        The value of ``file.name``.
    """
    uploaded_name = file.name
    return uploaded_name


def setup_gradio_demo():
    """Build the demo user interface with Gradio Blocks and launch it.

    The page has three columns: a file upload, the parsing options with the
    "Parse document" button and its status line, and four export buttons.
    The parse button calls ``parse_document`` and stores its document in a
    ``gr.State``; each export button passes that stored document to one of
    the ``to_*`` helpers and writes the result to the shared output box.
    """
    # File extensions offered by the upload widget.
    supported_extensions = [
        ".pdf",
        ".docx",
        ".pptx",
        ".csv",
        ".md",
        ".png",
        ".jpg",
        ".tiff",
        ".bmp",
        ".html",
        ".xhtml",
        ".xlsx",
    ]

    with gr.Blocks() as demo:
        gr.Markdown(
            """ # Docling - OCR: Parse documents, images, spreadsheets and more to markdown or other formats!
            
            Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).

            Model used for picture classification: [EfficientNet-B0 Document Image Classifier](https://huggingface.co/ds4sd/DocumentFigureClassifier)

            Model used for picture description: [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)

            To explore the full set of features of Docling visit: https://github.com/docling-project/docling
            """
        )

        with gr.Row():
            # Column 1: choose the file to process.
            with gr.Column():
                gr.Markdown("### 1) Upload")
                file_output = gr.File(
                    file_count="single", file_types=supported_extensions
                )

            # Column 2: optional enrichments and the parse trigger.
            with gr.Column():
                gr.Markdown("### 2) Configure engine & Parse")
                code_understanding = gr.Checkbox(
                    label="Enable Code understanding", value=False
                )
                formula_enrichment = gr.Checkbox(
                    label="Enable Formula understanding", value=False
                )
                picture_classification = gr.Checkbox(
                    label="Enable Picture classification", value=False
                )
                picture_description = gr.Checkbox(
                    label="Enable Picture description", value=False
                )
                gr.Markdown(
                    "_**Warning:** Enabling any of these features can potentially increase the processing time._"
                )

                parse_button = gr.Button("Parse document")
                status = gr.Markdown()

            # Column 3: export buttons for the parsed document.
            with gr.Column():
                gr.Markdown("### 3) Convert")

                html_button = gr.Button("Convert to HTML")
                markdown_button = gr.Button("Convert to markdown")
                json_button = gr.Button("Convert to JSON")
                text_button = gr.Button("Convert to text")

        # Holds the parsed document between the parse step and the exports.
        doc = gr.State()
        output = gr.Text(label="Output")

        parse_inputs = [
            file_output,
            code_understanding,
            formula_enrichment,
            picture_classification,
            picture_description,
        ]
        parse_button.click(
            fn=parse_document, inputs=parse_inputs, outputs=[doc, status]
        )

        # Every export button reads the stored document and writes to the
        # same output box; only the exporter function differs.
        export_bindings = (
            (html_button, to_html),
            (markdown_button, to_markdown),
            (json_button, to_json),
            (text_button, to_text),
        )
        for button, exporter in export_bindings:
            button.click(fn=exporter, inputs=doc, outputs=output)

    demo.launch()


# Build and launch the demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    setup_gradio_demo()