# Multi-Model-OCR / app.py
import os
import time
from threading import Thread

import spaces  # provides the @spaces.GPU decorator used below
import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoModelForCausalLM,
    Qwen2_5_VLForConditionalGeneration,
    TextIteratorStreamer,
)
# Try importing Qwen3VL if available
try:
    from transformers import Qwen3VLForConditionalGeneration
except ImportError:
    Qwen3VLForConditionalGeneration = None
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
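
# NOTE: the half-precision loads below assume a CUDA device; CPU-only runs
# would typically need torch.float32 instead of float16/bfloat16.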
# Load Chandra-OCR
MODEL_ID_V = "datalab-to/chandra"
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
if Qwen3VLForConditionalGeneration:
    model_v = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID_V,
        trust_remote_code=True,
        torch_dtype=torch.float16
    ).to(device).eval()
else:
    model_v = None
# Load Nanonets-OCR2-3B
MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_X,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
# Load Dots.OCR from a patched checkpoint on the Hub
MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
model_d = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH_D,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
).eval()
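
# NOTE: attn_implementation="flash_attention_2" requires the flash-attn package
# and a compatible NVIDIA GPU; dropping the argument falls back to the default
# attention implementation.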
# Load olmOCR-2-7B-1025
MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
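
# NOTE: the upstream deepseek-ai/DeepSeek-OCR checkpoint ships custom modeling
# code and documents loading via AutoModel with trust_remote_code=True; loading
# it through Qwen2_5_VLForConditionalGeneration below assumes the checkpoint is
# Qwen2.5-VL-compatible.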
# Load DeepSeek-OCR
MODEL_ID_DS = "deepseek-ai/deepseek-ocr"
processor_ds = AutoProcessor.from_pretrained(MODEL_ID_DS, trust_remote_code=True)
model_ds = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_DS,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int, temperature: float, top_p: float,
                   top_k: int, repetition_penalty: float):
    """
    Generates responses using the selected model for image input.
    Yields raw text and Markdown-formatted text.

    Args:
        model_name: Name of the OCR model to use
        text: Prompt text for the model
        image: PIL Image object to process
        max_new_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        top_k: Top-k sampling parameter
        repetition_penalty: Penalty for repeating tokens

    Yields:
        tuple: (raw_text, markdown_text)
    """
    # Select model and processor based on model_name
    if model_name == "olmOCR-2-7B-1025":
        processor = processor_m
        model = model_m
    elif model_name == "Nanonets-OCR2-3B":
        processor = processor_x
        model = model_x
    elif model_name == "Chandra-OCR":
        if model_v is None:
            yield "Chandra-OCR model not available.", "Chandra-OCR model not available."
            return
        processor = processor_v
        model = model_v
    elif model_name == "Dots.OCR":
        processor = processor_d
        model = model_d
    elif model_name == "DeepSeek-OCR":
        processor = processor_ds
        model = model_ds
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return
    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return
    # Prepare messages in chat format
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": text},
        ]
    }]
    # Apply chat template
    prompt_full = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Process inputs
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True
    ).to(device)
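    # NOTE: MAX_INPUT_TOKEN_LENGTH is read from the environment above but never
    # enforced; honoring it would require truncating inputs["input_ids"] here.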
    # Setup streaming generation
    streamer = TextIteratorStreamer(
        processor,
        skip_prompt=True,
        skip_special_tokens=True
    )
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    # Start generation in a separate thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Stream the results
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer, buffer
    # Ensure thread completes
    thread.join()
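
# A minimal programmatic sketch of the streaming API, outside Gradio (assumes a
# local "sample.png"; the file name and sampling values are illustrative):
#
#     from PIL import Image
#     img = Image.open("sample.png")
#     result = ""
#     for raw, _md in generate_image("Nanonets-OCR2-3B",
#                                    "Extract all text from this image.",
#                                    img, 1024, 0.7, 0.9, 50, 1.1):
#         result = raw  # each yield carries the full text so far
#     print(result)
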
# Example usage for Gradio interface
if __name__ == "__main__":
    import gradio as gr

    with gr.Blocks() as demo:
        gr.Markdown("# Multi-Model OCR Application")
        gr.Markdown("Upload an image and select a model to extract text")
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(
                    choices=[
                        "olmOCR-2-7B-1025",
                        "Nanonets-OCR2-3B",
                        "Chandra-OCR",
                        "Dots.OCR",
                        "DeepSeek-OCR"
                    ],
                    value="DeepSeek-OCR",
                    label="Select OCR Model"
                )
                image_input = gr.Image(type="pil", label="Upload Image")
                text_input = gr.Textbox(
                    value="Extract all text from this image.",
                    label="Prompt"
                )
                with gr.Accordion("Advanced Settings", open=False):
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=MAX_MAX_NEW_TOKENS,
                        value=DEFAULT_MAX_NEW_TOKENS,
                        step=1,
                        label="Max New Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )
                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top P"
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=50,
                        step=1,
                        label="Top K"
                    )
                    repetition_penalty = gr.Slider(
                        minimum=1.0,
                        maximum=2.0,
                        value=1.1,
                        step=0.1,
                        label="Repetition Penalty"
                    )
                submit_btn = gr.Button("Extract Text", variant="primary")
            with gr.Column():
                output_text = gr.Textbox(label="Extracted Text", lines=20)
                output_markdown = gr.Markdown(label="Formatted Output")
        submit_btn.click(
            fn=generate_image,
            inputs=[
                model_selector,
                text_input,
                image_input,
                max_tokens,
                temperature,
                top_p,
                top_k,
                repetition_penalty
            ],
            outputs=[output_text, output_markdown]
        )
    demo.launch()