File size: 5,243 Bytes
f5bd856
3f7e152
 
 
 
f5bd856
3f7e152
f5bd856
3f7e152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import gradio as gr
from pathlib import Path
import pandas as pd
import importlib
from docling.document_converter import DocumentConverter

import llm_document_parser.config as config

from llm_document_parser.instructor_llm import extract_json_data_using_ollama_llm, pull_ollama_model
from llm_document_parser.convert_doc_docling import (
    load_rapid_ocr_model,
    load_easy_ocr_model,
    load_ocr_mac_model,
    load_tesseract_model,
    image_to_text
)
from llm_document_parser.export_data import export_as_csv, export_as_json, combine_json_data_into_df, convert_json_to_df

# Diagnostic: print which copy of this script is executing — useful when
# multiple checkouts or installed versions of the package are on sys.path.
print("RUNNING gradio_app.py FROM:", __file__)

def load_ocr_model_from_config(model_type: str) -> DocumentConverter:
    """
    Build the document converter for the OCR engine named in the config.

    Args:
        model_type (str): OCR engine identifier; one of "rapid", "easy",
            "ocrmac", or "tesseract".

    Returns:
        DocumentConverter: Converter wired to the requested OCR engine.

    Raises:
        ValueError: If `model_type` is not a recognised engine name.
    """
    # TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
    # Factories are zero-arg callables so engine-specific arguments (and
    # config attribute reads) are only evaluated for the selected engine.
    factories = {
        "rapid": lambda: load_rapid_ocr_model(
            "PP-OCRv4/ch_PP-OCRv4_det_server_infer.onnx",
            "PP-OCRv3/ch_PP-OCRv3_rec_infer.onnx",
            "PP-OCRv3/ch_ppocr_mobile_v2.0_cls_train.onnx"
        ),
        "easy": load_easy_ocr_model,
        "ocrmac": load_ocr_mac_model,
        "tesseract": lambda: load_tesseract_model(config.TESSERACT_TESSDATA_LOCATION),
    }
    if model_type not in factories:
        raise ValueError(f"Unknown OCR model type in config: {model_type}")
    return factories[model_type]()


def save_results(export_type: str, output_file_name: str, df: pd.DataFrame, output_folder: str) -> str:
    """
    Save the extracted data in the specified export format.

    Args:
        export_type (str): The export format, "csv" or "json".
        output_file_name (str): The name of the output file.
        df (pd.DataFrame): The extracted data to save.
        output_folder (str): The folder to save the output file in.

    Returns:
        str: The exported data as text, or an empty string if
        `export_type` is not a recognised format.
    """
    if export_type == "csv":
        return export_as_csv(df=df, output_folder=output_folder, output_file_name=output_file_name)
    if export_type == "json":
        return export_as_json(df=df, output_folder=output_folder, output_file_name=output_file_name)

    # Unknown export type: deliberately a quiet no-op (the returned string is
    # what the UI displays). NOTE(review): consider raising ValueError instead.
    return ""

def process_file(input_path: Path, document_converter: DocumentConverter) -> str:
    """
    Run one document through OCR and the LLM extraction step.

    Args:
        input_path (Path): Path to the image/PDF to process.
        document_converter (DocumentConverter): Pre-loaded OCR converter.

    Returns:
        str: Structured JSON data extracted by the Ollama LLM.
    """
    # OCR the document and serialise the recognised content as markdown,
    # which is the text form the LLM prompt consumes.
    markdown_text = image_to_text(document_converter, input_path).document.export_to_markdown()

    return extract_json_data_using_ollama_llm(
        prompt=config.LLM_PROMPT,
        text_data=markdown_text,
        ollama_model=config.OLLAMA_MODEL,
        response_model=config.RESPONSE_MODEL
    )

# Full processing pipeline
def run_full_pipeline(file_inputs):
    """
    End-to-end pipeline: OCR each uploaded file, extract structured JSON
    with the local LLM, and export the combined result.

    Args:
        file_inputs: A list of file paths (Gradio multi-file upload) or a
            single path-like value.

    Returns:
        str: The exported data produced by `save_results` in the
        configured export format ("" for an unknown export type).
    """
    document_converter = load_ocr_model_from_config(config.OCR_MODEL)
    # Ensure the configured model is available locally before inference.
    pull_ollama_model(config.OLLAMA_MODEL)

    df = pd.DataFrame()
    if isinstance(file_inputs, list):  # was `type(...) == list`; isinstance is idiomatic
        json_data_objects = [process_file(file, document_converter) for file in file_inputs]
        # Build the DataFrame once after the loop — the original rebuilt it
        # from scratch on every iteration. Empty upload keeps the empty df.
        if json_data_objects:
            df = combine_json_data_into_df(json_data_objects)
    else:
        json_data = process_file(Path(file_inputs), document_converter)
        df = convert_json_to_df(json_data)

    return save_results(
        export_type=config.EXPORT_TYPE,
        output_file_name=config.OUTPUT_FILE_NAME,
        df=df,
        output_folder=config.OUTPUT_FOLDER,
    )
# NOTE(review): dead code — this module-level triple-quoted string is a
# disabled config-editor feature kept for reference. As a bare string
# expression it has no runtime effect; it also references `os`, which is
# not imported here. Delete it or revive it as real code.
'''
base_dir = Path(os.path.dirname(__file__))
config_file_path = base_dir / "src" / "llm_document_parser" / "config.py"
config_file_path = config_file_path.resolve()
code_contents = config_file_path.read_text()

def load_config():
    return config_file_path.read_text()

def save_config(updated_config):
    config_file_path.write_text(updated_config)
    importlib.reload(config)
    return "Config updated successfully!"
'''

# Build the Gradio UI. `demo` is launched from the __main__ guard below.
with gr.Blocks() as demo:
    # Header: interpolates the current config at import time — later edits
    # to config.py are not reflected without restarting the app.
    gr.Markdown(f"""
    # LLM Document Parser
    Checkout the GitHub repo for this Blueprint: https://github.com/oronadavid/llm-document-parser

    This app extracts structured data from a document using OCR and a local LLM.\n
    Selected OCR model: `{config.OCR_MODEL}`\n
    Selected LLM model: `{config.OLLAMA_MODEL}`\n
    Export format: `{config.EXPORT_TYPE}`\n
    Response Model: `{config.RESPONSE_MODEL.__name__}`
    """)

    # Multi-file upload; `run_full_pipeline` also tolerates a single path.
    file_input = gr.File(file_types=["image", ".pdf"], file_count="multiple", label="Upload Document(s) (Image/PDF)")

    run_button = gr.Button("Parse Documents")
    output_text = gr.JSON(label="Extracted Data")
    # Wire the button to the full OCR -> LLM -> export pipeline.
    run_button.click(fn=run_full_pipeline, inputs=file_input, outputs=output_text)

    # NOTE(review): disabled config-editor UI (bare string, no effect) —
    # pairs with the dead module-level block above; delete or revive.
    ''' 
    gr.Markdown("""# Config
    To update the config, make changes, then click "Update Config" below
    """)
    config_editor = gr.Code(code_contents, language="python", label="Config")
    save_config_button = gr.Button("Update Config")
    status = gr.Textbox(label="Status")

    demo.load(fn=load_config, outputs=config_editor)
    save_config_button.click(fn=save_config, inputs=config_editor, outputs=status)
    '''

if __name__ == "__main__":
    # share=True creates a public gradio.live tunnel — the app becomes
    # reachable from the internet, not just localhost.
    demo.launch(share=True)