# NOTE(review): removed "Spaces: / Running / Running" page-header artifacts
# left over from a copy-paste of the hosting page; they were not valid Python.
import importlib
from pathlib import Path

import gradio as gr
import pandas as pd
from docling.document_converter import DocumentConverter

import llm_document_parser.config as config
from llm_document_parser.convert_doc_docling import (
    image_to_text,
    load_easy_ocr_model,
    load_ocr_mac_model,
    load_rapid_ocr_model,
    load_tesseract_model,
)
from llm_document_parser.export_data import (
    combine_json_data_into_df,
    convert_json_to_df,
    export_as_csv,
    export_as_json,
)
from llm_document_parser.instructor_llm import (
    extract_json_data_using_ollama_llm,
    pull_ollama_model,
)
# Debug aid: confirm which copy of this script is actually running.
print(f"RUNNING gradio_app.py FROM: {__file__}")
# Load OCR model based on config
def load_ocr_model_from_config(model_type: str) -> DocumentConverter:
    """
    Build the document converter matching the configured OCR backend.

    Args:
        model_type (str): The type of OCR model to load
            ("rapid", "easy", "ocrmac", or "tesseract").

    Returns:
        object: The loaded OCR model.

    Raises:
        ValueError: If ``model_type`` is not a recognised backend name.
    """
    # Lambdas defer the (potentially expensive) model load until after the
    # backend name has been validated.
    loaders = {
        # TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
        "rapid": lambda: load_rapid_ocr_model(
            "PP-OCRv4/ch_PP-OCRv4_det_server_infer.onnx",
            "PP-OCRv3/ch_PP-OCRv3_rec_infer.onnx",
            "PP-OCRv3/ch_ppocr_mobile_v2.0_cls_train.onnx",
        ),
        "easy": lambda: load_easy_ocr_model(),
        "ocrmac": lambda: load_ocr_mac_model(),
        "tesseract": lambda: load_tesseract_model(config.TESSERACT_TESSDATA_LOCATION),
    }
    if model_type not in loaders:
        raise ValueError(f"Unknown OCR model type in config: {model_type}")
    return loaders[model_type]()
def save_results(export_type: str, output_file_name: str, df: pd.DataFrame, output_folder: str) -> str:
    """
    Save the results in the specified format.

    Args:
        export_type (str): The export format ("csv" or "json").
        output_file_name (str): The name of the output file.
        df (pd.DataFrame): The extracted data to save.
        output_folder (str): The folder to save the output file in.

    Returns:
        str: The output data from the LLM formatted into the specified
        format, or an empty string when ``export_type`` is unsupported.
    """
    if export_type == "csv":
        return export_as_csv(df=df, output_folder=output_folder, output_file_name=output_file_name)
    if export_type == "json":
        return export_as_json(df=df, output_folder=output_folder, output_file_name=output_file_name)
    # NOTE(review): unrecognised export types are silently ignored; consider
    # raising a ValueError here (kept as-is to preserve current behaviour).
    return ""
def process_file(input_path: Path, document_converter: DocumentConverter) -> str:
    """
    OCR a single document and extract structured data from it via the LLM.

    Args:
        input_path (Path): Path of the image/PDF to process.
        document_converter (DocumentConverter): Pre-loaded OCR converter.

    Returns:
        str: The JSON data extracted by the configured Ollama model.
    """
    # Run OCR, then hand the recognised text to the LLM as markdown.
    markdown_text = image_to_text(document_converter, input_path).document.export_to_markdown()
    return extract_json_data_using_ollama_llm(
        prompt=config.LLM_PROMPT,
        text_data=markdown_text,
        ollama_model=config.OLLAMA_MODEL,
        response_model=config.RESPONSE_MODEL,
    )
# Full processing pipeline
def run_full_pipeline(file_inputs):
    """
    Run OCR + LLM extraction over the uploaded file(s) and export the result.

    Args:
        file_inputs: A single file path, or a list of file paths as supplied
            by the Gradio ``File`` component.

    Returns:
        str: The exported data produced by ``save_results`` (format set by
        ``config.EXPORT_TYPE``).
    """
    document_converter = load_ocr_model_from_config(config.OCR_MODEL)
    # Ensure the configured LLM is available locally before processing starts.
    pull_ollama_model(config.OLLAMA_MODEL)
    # isinstance instead of `type(...) == list` so list subclasses work too.
    if isinstance(file_inputs, list):
        json_data_objects = [
            process_file(file, document_converter) for file in file_inputs
        ]
        df = combine_json_data_into_df(json_data_objects)
    else:
        json_data = process_file(Path(file_inputs), document_converter)
        df = convert_json_to_df(json_data)
    return save_results(
        export_type=config.EXPORT_TYPE,
        output_file_name=config.OUTPUT_FILE_NAME,
        df=df,
        output_folder=config.OUTPUT_FOLDER,
    )
'''
base_dir = Path(os.path.dirname(__file__))
config_file_path = base_dir / "src" / "llm_document_parser" / "config.py"
config_file_path = config_file_path.resolve()
code_contents = config_file_path.read_text()

def load_config():
    return config_file_path.read_text()

def save_config(updated_config):
    config_file_path.write_text(updated_config)
    importlib.reload(config)
    return "Config updated successfully!"
'''
with gr.Blocks() as demo:
    # Header: surface the active configuration so users know what will run.
    gr.Markdown(f"""
# LLM Document Parser
Checkout the GitHub repo for this Blueprint: https://github.com/oronadavid/llm-document-parser
This app extracts structured data from a document using OCR and a local LLM.\n
Selected OCR model: `{config.OCR_MODEL}`\n
Selected LLM model: `{config.OLLAMA_MODEL}`\n
Export format: `{config.EXPORT_TYPE}`\n
Response Model: `{config.RESPONSE_MODEL.__name__}`
""")
    # Inputs and outputs: upload one or more documents, parse on click,
    # render the extracted data as JSON.
    file_input = gr.File(
        file_types=["image", ".pdf"],
        file_count="multiple",
        label="Upload Document(s) (Image/PDF)",
    )
    run_button = gr.Button("Parse Documents")
    output_text = gr.JSON(label="Extracted Data")
    run_button.click(fn=run_full_pipeline, inputs=file_input, outputs=output_text)
'''
gr.Markdown("""# Config
To update the config, make changes, then click "Update Config" below
""")
config_editor = gr.Code(code_contents, language="python", label="Config")
save_config_button = gr.Button("Update Config")
status = gr.Textbox(label="Status")
demo.load(fn=load_config, outputs=config_editor)
save_config_button.click(fn=save_config, inputs=config_editor, outputs=status)
'''
# Launch the Gradio UI only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch(share=True)