"""
DeepSeek-OCR Gradio Interface for Hugging Face Spaces
------------------------------------------------------
Simplified Gradio app optimized for ZeroGPU deployment
"""

import io
import os
import tempfile
import traceback
from pathlib import Path

import fitz  # PyMuPDF
import gradio as gr
import spaces
import torch
from PIL import Image

# Number of leading PDF pages that are OCR'd per request (keeps a request
# inside the GPU time budget declared on process_pdf).
MAX_PDF_PAGES = 3

# Initialize model (will be loaded on first use with ZeroGPU)
model = None
processor = None


def load_model():
    """Load the DeepSeek-OCR model and processor, caching them in module globals.

    The first call imports the processor class, downloads/loads the weights
    and stores both objects in the module-level ``model`` / ``processor``
    variables; later calls return the cached objects.

    Returns:
        tuple: ``(model, processor)``.
    """
    global model, processor

    if model is None:
        from transformers import AutoModelForCausalLM, AutoTokenizer

        try:
            # Try importing from backend.process first (for Hugging Face Space)
            from backend.process.image_process import DeepseekOCRProcessor
        except ImportError:
            # Fall back to process.image_process (for local deployment)
            from process.image_process import DeepseekOCRProcessor

        model_path = "deepseek-ai/DeepSeek-OCR"

        print("Loading DeepSeek-OCR model...")
        processor = DeepseekOCRProcessor.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("Model loaded successfully!")

    return model, processor


@spaces.GPU(duration=120)
def perform_ocr(image, prompt_text):
    """Perform OCR on the uploaded image.

    Args:
        image: PIL Image or file path. ``None`` (nothing uploaded) yields a
            user-facing hint instead of an error.
        prompt_text: Custom prompt for the OCR task; empty/blank/``None``
            falls back to "Free OCR.".

    Returns:
        str: Extracted text or analysis result, or an error message string
        (this function never raises; failures are reported in the return
        value so the UI can display them).
    """
    try:
        if image is None:
            return "❌ Please upload an image first."

        ocr_model, ocr_processor = load_model()

        # Normalize the input to an RGB PIL image. Paths were already
        # converted to RGB; PIL inputs are converted too so both input kinds
        # reach the processor in the same mode.
        if isinstance(image, str):
            image = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            image = image.convert("RGB")
        else:
            raise ValueError("Invalid image input")

        # NOTE(review): upstream DeepSeek-OCR prompts usually begin with an
        # "<image>" placeholder before the newline - confirm that
        # tokenize_with_images inserts it, or that it was not lost from
        # these literals.
        if not prompt_text or not prompt_text.strip():
            prompt = "\nFree OCR."
        else:
            prompt = f"\n{prompt_text}"

        # Process image
        inputs = ocr_processor.tokenize_with_images(
            images=[image],
            prompt=prompt,
            bos=True,
            eos=True,
            cropping=True,
        )

        # Move tensors to the model's device; leave non-tensor entries as-is.
        inputs = {
            key: value.to(ocr_model.device) if isinstance(value, torch.Tensor) else value
            for key, value in inputs.items()
        }

        # Generate (greedy decoding, no gradients needed for inference)
        with torch.no_grad():
            outputs = ocr_model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
                temperature=1.0,
                top_p=1.0,
                use_cache=True,
            )

        # Decode only the newly generated tokens (skip the echoed prompt).
        prompt_length = inputs["input_ids"].shape[1]
        return ocr_processor.tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        )

    except Exception as e:
        return f"Error during OCR processing: {str(e)}"


def _render_page_to_image(page):
    """Rasterize one PyMuPDF page at 2x resolution and return an RGB PIL image.

    The PNG bytes are decoded in memory, so no temporary file is created and
    nothing is left on disk if a later step fails.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
    with Image.open(io.BytesIO(pix.tobytes("png"))) as rendered:
        # convert() loads the pixel data and returns an independent image,
        # so the result stays usable after the source image is closed.
        return rendered.convert("RGB")


@spaces.GPU(duration=180)
def process_pdf(pdf_file, prompt_text):
    """Process a PDF file by running OCR on its first few pages.

    Args:
        pdf_file: Uploaded PDF file path (string).
        prompt_text: Custom prompt for the OCR task.

    Returns:
        str: Per-page OCR output joined into one string (at most
        ``MAX_PDF_PAGES`` pages, with a trailing note when the PDF is
        longer), or a user-facing error message. This function never raises.
    """
    try:
        # Validate file upload
        if pdf_file is None or pdf_file == "":
            return "❌ Please upload a PDF file first."

        # pdf_file is a filepath string
        pdf_path = pdf_file

        if not os.path.exists(pdf_path):
            return f"❌ File not found: {pdf_path}"

        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = len(pdf_document)

            if total_pages == 0:
                return "❌ PDF file is empty (0 pages)."

            # Process only the first pages (to avoid timeout)
            max_pages = min(MAX_PDF_PAGES, total_pages)

            results = []
            for page_num in range(max_pages):
                image = _render_page_to_image(pdf_document[page_num])
                result = perform_ocr(image, prompt_text)
                results.append(f"--- Page {page_num + 1} ---\n{result}\n")
        finally:
            # Always release the document, including on errors/early return.
            pdf_document.close()

        # Add note if PDF has more pages
        if max_pages < total_pages:
            results.append(
                f"\n(Only first {max_pages} pages processed. Full PDF has {total_pages} pages)"
            )

        return "\n".join(results)

    except Exception as e:
        # Keep the full traceback in the server log; the user only sees the
        # short message below.
        print(traceback.format_exc())
        return f"❌ Error processing PDF: {str(e)}\n\nPlease make sure you uploaded a valid PDF file."


# Create Gradio Interface
with gr.Blocks(title="DeepSeek-OCR Studio", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔍 DeepSeek-OCR Studio

    Advanced OCR system supporting:
    - 📝 Multi-language text recognition (Chinese, English, etc.)
    - 📊 Table & chart extraction
    - 🎨 Professional drawing analysis (CAD, flowcharts)
    - 📄 PDF document processing & OCR
    - 📐 Layout analysis & Markdown conversion

    **Note**: Running on ZeroGPU - first request may take longer to load the model.
    """)

    with gr.Tab("Image OCR"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload Image")
                image_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="Free OCR.",
                    value="Free OCR.",
                    lines=2
                )
                image_btn = gr.Button("Extract Text", variant="primary")

            with gr.Column():
                image_output = gr.Textbox(
                    label="Extracted Text",
                    lines=20,
                    show_copy_button=True
                )

        image_btn.click(
            fn=perform_ocr,
            inputs=[image_input, image_prompt],
            outputs=image_output
        )

        gr.Examples(
            examples=[
                ["examples/sample1.png", "Free OCR."],
                ["examples/sample2.jpg", "Extract all text and tables."],
            ],
            inputs=[image_input, image_prompt],
            label="Example Images (if available)"
        )

    with gr.Tab("PDF OCR"):
        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="filepath"
                )
                pdf_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="Free OCR.",
                    value="Free OCR.",
                    lines=2
                )
                pdf_btn = gr.Button("Process PDF (First 3 Pages)", variant="primary")

            with gr.Column():
                pdf_output = gr.Textbox(
                    label="Extracted Text",
                    lines=20,
                    show_copy_button=True
                )

        pdf_btn.click(
            fn=process_pdf,
            inputs=[pdf_input, pdf_prompt],
            outputs=pdf_output
        )

    with gr.Tab("Advanced Prompts"):
        gr.Markdown("""
        ### Prompt Examples

        **Basic OCR:**
        ```
        Free OCR.
        ```

        **Table Extraction:**
        ```
        Extract all tables and convert to markdown format.
        ```

        **Chart Analysis:**
        ```
        Analyze this chart and extract data in table format.
        ```

        **Multi-language:**
        ```
        Extract all text in multiple languages.
        ```

        **CAD Drawing:**
        ```
        Analyze this technical drawing and describe its components.
        ```
        """)

    gr.Markdown("""
    ---
    ### About

    - **Model**: [DeepSeek-OCR](https://huggingface.co/deepseek-ai/DeepSeek-OCR)
    - **Project**: [DeepSeek-OCR-Web](https://github.com/fufankeji/DeepSeek-OCR-Web)
    - **GPU**: ZeroGPU (Hugging Face Spaces)

    ### Features

    - 🔍 **Image OCR**: Upload images for text extraction
    - 📄 **PDF OCR**: Extract text from PDF documents (first 3 pages)
    - 📊 **Table & Chart**: Extract tables and analyze charts
    - 🌍 **Multi-language**: Support for 100+ languages

    ### Note

    - Processing time: 30-120 seconds per image/page
    - PDF OCR limited to first 3 pages on ZeroGPU
    - For full functionality, deploy locally with GPU
    """)


if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch()