# NOTE(review): removed "Spaces: / Running / Running" page-header artifacts
# left over from a copy-paste of the hosting page; they were not valid Python.
import importlib
from pathlib import Path

import gradio as gr
import pandas as pd
from docling.document_converter import DocumentConverter

import llm_document_parser.config as config
from llm_document_parser.convert_doc_docling import (
    image_to_text,
    load_easy_ocr_model,
    load_ocr_mac_model,
    load_rapid_ocr_model,
    load_tesseract_model,
)
from llm_document_parser.export_data import (
    combine_json_data_into_df,
    convert_json_to_df,
    export_as_csv,
    export_as_json,
)
from llm_document_parser.instructor_llm import (
    extract_json_data_using_ollama_llm,
    pull_ollama_model,
)
# Debug aid: confirm which copy of this script is actually running.
print(f"RUNNING gradio_app.py FROM: {__file__}")
# Load OCR model based on config
def load_ocr_model_from_config(model_type: str) -> DocumentConverter:
    """
    Build the document converter matching the configured OCR backend.

    Args:
        model_type (str): The type of OCR model to load
            ("rapid", "easy", "ocrmac", or "tesseract").

    Returns:
        object: The loaded OCR model.

    Raises:
        ValueError: If ``model_type`` is not a recognised backend name.
    """
    # Lambdas defer the (potentially expensive) model load until after the
    # backend name has been validated.
    loaders = {
        # TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
        "rapid": lambda: load_rapid_ocr_model(
            "PP-OCRv4/ch_PP-OCRv4_det_server_infer.onnx",
            "PP-OCRv3/ch_PP-OCRv3_rec_infer.onnx",
            "PP-OCRv3/ch_ppocr_mobile_v2.0_cls_train.onnx",
        ),
        "easy": lambda: load_easy_ocr_model(),
        "ocrmac": lambda: load_ocr_mac_model(),
        "tesseract": lambda: load_tesseract_model(config.TESSERACT_TESSDATA_LOCATION),
    }
    if model_type not in loaders:
        raise ValueError(f"Unknown OCR model type in config: {model_type}")
    return loaders[model_type]()
def save_results(export_type: str, output_file_name: str, df: pd.DataFrame, output_folder: str) -> str:
    """
    Save the results in the specified format.

    Args:
        export_type (str): The export format ("csv" or "json").
        output_file_name (str): The name of the output file.
        df (pd.DataFrame): The extracted data to save.
        output_folder (str): The folder to save the output file in.

    Returns:
        str: The output data from the LLM formatted into the specified
        format, or an empty string when ``export_type`` is unsupported.
    """
    if export_type == "csv":
        return export_as_csv(df=df, output_folder=output_folder, output_file_name=output_file_name)
    if export_type == "json":
        return export_as_json(df=df, output_folder=output_folder, output_file_name=output_file_name)
    # NOTE(review): unrecognised export types are silently ignored; consider
    # raising a ValueError here (kept as-is to preserve current behaviour).
    return ""
def process_file(input_path: Path, document_converter: DocumentConverter) -> str:
    """
    OCR a single document and extract structured data from it via the LLM.

    Args:
        input_path (Path): Path of the image/PDF to process.
        document_converter (DocumentConverter): Pre-loaded OCR converter.

    Returns:
        str: The JSON data extracted by the configured Ollama model.
    """
    # Run OCR, then hand the recognised text to the LLM as markdown.
    markdown_text = image_to_text(document_converter, input_path).document.export_to_markdown()
    return extract_json_data_using_ollama_llm(
        prompt=config.LLM_PROMPT,
        text_data=markdown_text,
        ollama_model=config.OLLAMA_MODEL,
        response_model=config.RESPONSE_MODEL,
    )
# Full processing pipeline
def run_full_pipeline(file_inputs):
    """
    Run OCR + LLM extraction over the uploaded file(s) and export the result.

    Args:
        file_inputs: A single file path, or a list of file paths as supplied
            by the Gradio ``File`` component.

    Returns:
        str: The exported data produced by ``save_results`` (format set by
        ``config.EXPORT_TYPE``).
    """
    document_converter = load_ocr_model_from_config(config.OCR_MODEL)
    # Ensure the configured LLM is available locally before processing starts.
    pull_ollama_model(config.OLLAMA_MODEL)
    # isinstance instead of `type(...) == list` so list subclasses work too.
    if isinstance(file_inputs, list):
        json_data_objects = [
            process_file(file, document_converter) for file in file_inputs
        ]
        df = combine_json_data_into_df(json_data_objects)
    else:
        json_data = process_file(Path(file_inputs), document_converter)
        df = convert_json_to_df(json_data)
    return save_results(
        export_type=config.EXPORT_TYPE,
        output_file_name=config.OUTPUT_FILE_NAME,
        df=df,
        output_folder=config.OUTPUT_FOLDER,
    )
'''
base_dir = Path(os.path.dirname(__file__))
config_file_path = base_dir / "src" / "llm_document_parser" / "config.py"
config_file_path = config_file_path.resolve()
code_contents = config_file_path.read_text()

def load_config():
    return config_file_path.read_text()

def save_config(updated_config):
    config_file_path.write_text(updated_config)
    importlib.reload(config)
    return "Config updated successfully!"
'''
with gr.Blocks() as demo:
    # Header: surface the active configuration so users know what will run.
    gr.Markdown(f"""
# LLM Document Parser
Checkout the GitHub repo for this Blueprint: https://github.com/oronadavid/llm-document-parser
This app extracts structured data from a document using OCR and a local LLM.\n
Selected OCR model: `{config.OCR_MODEL}`\n
Selected LLM model: `{config.OLLAMA_MODEL}`\n
Export format: `{config.EXPORT_TYPE}`\n
Response Model: `{config.RESPONSE_MODEL.__name__}`
""")
    # Inputs and outputs: upload one or more documents, parse on click,
    # render the extracted data as JSON.
    file_input = gr.File(
        file_types=["image", ".pdf"],
        file_count="multiple",
        label="Upload Document(s) (Image/PDF)",
    )
    run_button = gr.Button("Parse Documents")
    output_text = gr.JSON(label="Extracted Data")
    run_button.click(fn=run_full_pipeline, inputs=file_input, outputs=output_text)
'''
gr.Markdown("""# Config
To update the config, make changes, then click "Update Config" below
""")
config_editor = gr.Code(code_contents, language="python", label="Config")
save_config_button = gr.Button("Update Config")
status = gr.Textbox(label="Status")
demo.load(fn=load_config, outputs=config_editor)
save_config_button.click(fn=save_config, inputs=config_editor, outputs=status)
'''
# Launch the Gradio UI only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch(share=True)