# app.py - olmOCR Gradio app for Hugging Face Spaces deployment
import base64
import warnings
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_no_anchoring_v4_yaml_prompt

warnings.filterwarnings('ignore')

# Initialize the model with CPU optimizations
print("Loading model... This may take a few minutes on CPU")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-2-7B-1025",
    torch_dtype=torch.float32,  # use float32 for CPU
    low_cpu_mem_usage=True,     # optimize memory usage during load
).eval()
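# olmOCR-2-7B-1025 is fine-tuned from Qwen2.5-VL-7B-Instruct, so the base
# model's processor (tokenizer + image preprocessor) is reused here.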
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
device = torch.device("cpu")
model.to(device)
print("Model loaded successfully")
def process_document(file, page_number, max_tokens):
    """
    Process a PDF or image file and extract text using olmOCR.

    Args:
        file: Uploaded file (PDF, PNG, or JPEG)
        page_number: Page number to process (for PDFs)
        max_tokens: Maximum number of tokens to generate

    Returns:
        Extracted text output and the processed image
    """
    if file is None:
        return "Please upload a file first.", None

    try:
        # Handle different file types
        if file.name.endswith('.pdf'):
            # Render PDF page to base64 image with smaller size for CPU
            image_base64 = render_pdf_to_base64png(
                file.name,
                page_number,
                target_longest_image_dim=1024,  # reduced from 1288 for CPU
            )
            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        else:
            # Handle image files directly
            main_image = Image.open(file.name)
            # Resize large images for CPU efficiency
            max_size = 1024
            if max(main_image.size) > max_size:
                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            buffered = BytesIO()
            main_image.save(buffered, format="PNG")
            image_base64 = base64.b64encode(buffered.getvalue()).decode()
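
        # Both branches end with the same pair: a base64 PNG for the prompt's
        # data URL and a PIL image for the processor's vision inputs.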
        # Build the full prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]
        # Apply the chat template and processor
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for (key, value) in inputs.items()}
        # Generate with CPU-optimized settings
        with torch.no_grad():  # disable gradient computation for inference
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                num_return_sequences=1,
                do_sample=False,  # greedy decoding is faster on CPU; temperature would be ignored
                num_beams=1,      # no beam search, for speed
            )
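        # generate() returns prompt + completion tokens in one tensor, so the
        # prompt span is sliced off before decoding.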
        # Decode the output
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens, skip_special_tokens=True
        )

        return text_output[0], main_image
    except Exception as e:
        return f"Error processing file: {e}", None

# Create Gradio interface
with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
    gr.Markdown("""
    Upload a PDF or image file to extract text using the olmOCR model.

    ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document (PDF, PNG, or JPEG)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            )
            page_number = gr.Slider(
                minimum=1,
                maximum=50,
                value=1,
                step=1,
                label="Page Number (for PDFs)",
            )
            max_tokens = gr.Slider(
                minimum=100,
                maximum=1024,  # reduced max for CPU
                value=512,
                step=50,
                label="Max Tokens",
            )
            process_btn = gr.Button("Extract Text", variant="primary")
            gr.Markdown("""
            ### Tips for CPU Usage:
            - Smaller images process faster
            - First run may be slower (model loading)
            - Reduce max tokens for faster results
            """)

        with gr.Column():
            output_text = gr.Textbox(
                label="Extracted Text",
                lines=20,
                placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds.",
            )
            output_image = gr.Image(label="Processed Image")

    process_btn.click(
        fn=process_document,
        inputs=[file_input, page_number, max_tokens],
        outputs=[output_text, output_image],
    )
    # gr.Examples can be wired up here once sample documents are added;
    # an Examples component with an empty list renders nothing useful.
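
# Spaces serves Gradio apps on port 7860; binding to 0.0.0.0 makes the app
# reachable from outside the container. Locally: `python app.py`.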
if __name__ == "__main__":
    demo.queue(max_size=3)  # limit queue to prevent overload
    demo.launch(server_name="0.0.0.0", server_port=7860)