"""DeepSeek-OCR Gradio demo: multi-language & handwritten text extraction.

Loads the DeepSeek-OCR model once at import time, then exposes a Gradio UI
that runs OCR over uploaded images or PDFs, draws grounding bounding boxes,
and reports per-run statistics.
"""

import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
import spaces
import os
import sys
import tempfile
import shutil
from PIL import Image, ImageDraw, ImageFont, ImageOps
import fitz
import re
import warnings
import numpy as np
import base64
from io import StringIO, BytesIO
import time
import json

# Model Configuration
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'

# Device configuration
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Choose dtype based on device: fp16 only makes sense on GPU.
    dtype = torch.float16 if DEVICE.type == 'cuda' else torch.float32
    # Try flash attention first, fall back to standard attention if not available.
    try:
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            _attn_implementation='flash_attention_2',
            torch_dtype=dtype,
            trust_remote_code=True,
            use_safetensors=True,
        )
    except Exception:
        print("Flash attention not available, using standard attention")
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            torch_dtype=dtype,
            trust_remote_code=True,
            use_safetensors=True,
        )
    model = model.eval().to(DEVICE)
    MODEL_LOADED = True
except Exception as e:
    # The UI still comes up without the model; processing functions bail early.
    print(f"Warning: Could not load model - {e}")
    MODEL_LOADED = False

# Enhanced Model Configurations.
# NOTE(review): base_size/image_size/crop_mode are currently only surfaced in the
# stats display — they are not forwarded to model.chat(). Confirm whether the
# remote-code chat() API accepts them before wiring through.
MODEL_CONFIGS = {
    "⚔ Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True,
                  "description": "Best balance - 1024 base + 640 tiles with cropping"},
    "šŸš€ Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False,
                "description": "Fastest - 512Ɨ512, no crop"},
    "šŸ“„ Small": {"base_size": 640, "image_size": 640, "crop_mode": False,
                 "description": "Quick - 640Ɨ640, no crop"},
    "šŸ“Š Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False,
                "description": "Standard - 1024Ɨ1024, no crop"},
    "šŸŽÆ Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False,
                 "description": "Highest quality - 1280Ɨ1280, no crop"},
}

# Multi-language Support: each entry maps a display label to an ISO code and a
# suffix appended to the task prompt.
SUPPORTED_LANGUAGES = {
    "šŸŒ Auto-Detect": {"code": "auto", "prompt_suffix": ""},
    "šŸ‡ŗšŸ‡ø English": {"code": "en", "prompt_suffix": " Extract text in English."},
    "šŸ‡øšŸ‡¦ Arabic": {"code": "ar", "prompt_suffix": " Extract text in Arabic. Handle right-to-left text properly."},
    "šŸ‡µšŸ‡° Urdu": {"code": "ur", "prompt_suffix": " Extract text in Urdu. Handle right-to-left text properly."},
    "šŸ‡ØšŸ‡³ Chinese": {"code": "zh", "prompt_suffix": " Extract text in Chinese."},
    "šŸ‡ÆšŸ‡µ Japanese": {"code": "ja", "prompt_suffix": " Extract text in Japanese."},
    "šŸ‡°šŸ‡· Korean": {"code": "ko", "prompt_suffix": " Extract text in Korean."},
    "šŸ‡ŖšŸ‡ø Spanish": {"code": "es", "prompt_suffix": " Extract text in Spanish."},
    "šŸ‡«šŸ‡· French": {"code": "fr", "prompt_suffix": " Extract text in French."},
    "šŸ‡©šŸ‡Ŗ German": {"code": "de", "prompt_suffix": " Extract text in German."},
    "šŸ‡®šŸ‡³ Hindi": {"code": "hi", "prompt_suffix": " Extract text in Hindi."},
    "šŸ‡·šŸ‡ŗ Russian": {"code": "ru", "prompt_suffix": " Extract text in Russian."},
}

# Enhanced Task Prompts: "has_grounding" marks prompts that request
# <|ref|>/<|det|> grounding tokens in the model output.
TASK_PROMPTS = {
    "šŸ“‹ Markdown": {"prompt": "\n<|grounding|>Convert the document to markdown.",
                    "has_grounding": True,
                    "description": "Convert document to structured markdown with grounding"},
    "šŸ“ Free OCR": {"prompt": "\nExtract all text from this image.",
                    "has_grounding": False,
                    "description": "Simple text extraction"},
    "šŸ“ Locate": {"prompt": "\nLocate <|ref|>text<|/ref|> in the image.",
                  "has_grounding": True,
                  "description": "Find specific text with bounding boxes"},
    "šŸ” Describe": {"prompt": "\nDescribe this image in detail.",
                    "has_grounding": False,
                    "description": "General image description"},
    "āœļø Handwritten": {"prompt": "\n<|grounding|>Extract handwritten text from this image.",
                       "has_grounding": True,
                       "description": "Specialized handwritten text extraction"},
    "šŸ“Š Table Extract": {"prompt": "\n<|grounding|>Extract table data and convert to markdown table format.",
                         "has_grounding": True,
                         "description": "Extract and format table data"},
    "āœļø Custom": {"prompt": "", "has_grounding": False,
                  "description": "Your own custom prompt"},
}

# DeepSeek-OCR grounding span: <|ref|>text<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
_GROUNDING_RE = re.compile(
    r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
_BOX_RE = re.compile(r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]')


def extract_grounding_references(text):
    """Parse grounding references out of raw model output.

    Returns a list of dicts with:
      - 'content': the referenced text span
      - 'ref_seg': [x1, y1, x2, y2] bounding-box coordinates

    FIX: the previous pattern had been reduced to r'(.*?)' (grounding markup
    lost), which matched empty strings and produced entries incompatible with
    draw_bounding_boxes (which indexes ref['ref_seg']).
    """
    if not text:
        return []
    refs = []
    for content, det in _GROUNDING_RE.findall(text):
        for box in _BOX_RE.findall(det):
            refs.append({
                'content': content.strip(),
                'ref_seg': [int(v) for v in box],
            })
    return refs


def draw_bounding_boxes(image, refs, extract_images=False, show_confidence=True):
    """Draw annotated bounding boxes for grounding refs on a copy of *image*.

    Args:
        image: PIL image to annotate (left unmodified).
        refs: list of dicts with 'ref_seg' ([x1, y1, x2, y2]) and 'content'.
        extract_images: also return each box cropped out of the original.
        show_confidence: overlay a (simulated) confidence label per box.

    Returns:
        (annotated_image, list_of_cropped_images)

    NOTE(review): ref_seg values are drawn as raw pixel coordinates; if the
    model emits normalized (0-999) coordinates they need rescaling — confirm
    against actual model output.
    """
    if not refs:
        return image, []

    # Try to load a nicer font; fall back to PIL's built-in bitmap font.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
        small_font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        font = ImageFont.load_default()
        small_font = ImageFont.load_default()

    img_with_boxes = image.copy()
    draw = ImageDraw.Draw(img_with_boxes)

    # Color map for different types of text.
    color_map = {
        'text': '#FF6B6B',         # Red for general text
        'table': '#4ECDC4',        # Teal for tables
        'handwritten': '#45B7D1',  # Blue for handwritten
        'title': '#96CEB4',        # Green for titles
        'default': '#FFEAA7',      # Yellow for others
    }

    cropped_images = []
    for i, ref in enumerate(refs):
        coords = [int(x) for x in ref['ref_seg']]
        x1, y1, x2, y2 = coords

        # Simulated confidence (the model does not expose per-box scores).
        confidence = np.random.uniform(0.85, 0.99)

        # Pick a color from keywords in the referenced content.
        text_content = ref.get('content', '').lower()
        if 'table' in text_content or '|' in text_content:
            color = color_map['table']
        elif any(word in text_content for word in ['handwritten', 'signature']):
            color = color_map['handwritten']
        elif any(word in text_content for word in ['title', 'heading', 'header']):
            color = color_map['title']
        else:
            color = color_map['default']

        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        if show_confidence:
            conf_text = f"{confidence:.1%}"
            bbox = draw.textbbox((0, 0), conf_text, font=small_font)
            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
            # Place the label above the box when there is room, else below.
            tx = x1
            ty = max(0, y1 - th - 6) if y1 > th + 6 else y2 + 2
            draw.rectangle([tx - 2, ty - 2, tx + tw + 2, ty + th + 2],
                           fill=color, outline=None)
            draw.text((tx, ty), conf_text, fill='white', font=small_font)

        # Reference number badge in the top-right corner of the box.
        ref_text = f"#{i + 1}"
        bbox = draw.textbbox((0, 0), ref_text, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        tx = x2 - tw - 4
        ty = y1 + 4
        draw.rectangle([tx - 2, ty - 2, tx + tw + 2, ty + th + 2],
                       fill='black', outline=None)
        draw.text((tx, ty), ref_text, fill='white', font=font)

        if extract_images:
            try:
                cropped = image.crop((x1, y1, x2, y2))
                cropped_images.append(cropped)
            except Exception as e:
                print(f"Error cropping image: {e}")

    return img_with_boxes, cropped_images


def clean_output(text, include_images=False, remove_labels=False):
    """Strip grounding markup from model output and normalize whitespace.

    FIX: the first substitution pattern had been reduced to r'.*?' (a no-op);
    restored to replace each grounding span with its referenced text.
    (include_images / remove_labels are kept for interface compatibility.)
    """
    if not text:
        return ""
    # Replace grounding spans with their text content, dropping box payloads.
    text = _GROUNDING_RE.sub(lambda m: m.group(1), text)
    # Drop any remaining special tokens such as <|grounding|>.
    text = re.sub(r'<[^>]+>', '', text)
    # Collapse runs of blank lines.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()


def embed_images(markdown, crops):
    """Append cropped region images to *markdown* as base64 data URIs."""
    if not crops:
        return markdown
    embedded_md = markdown + "\n\n## Extracted Regions\n\n"
    for i, crop in enumerate(crops):
        buffered = BytesIO()
        crop.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        embedded_md += (
            f"### Region {i + 1}\n"
            f"![Region {i + 1}](data:image/png;base64,{img_str})\n\n"
        )
    return embedded_md


@spaces.GPU(duration=60)
def process_image(image, mode, task, custom_prompt,
                  language="šŸŒ Auto-Detect", progress=gr.Progress()):
    """Run OCR on a single image.

    Returns (clean_text, markdown_text, raw_output, annotated_image,
    cropped_regions, confidence_data). On failure the first three slots carry
    an error message and confidence_data contains an "error" key.
    """
    if not MODEL_LOADED:
        return "āŒ Model not loaded. Please check your setup.", "", "", None, [], {}
    if image is None:
        return "āŒ No image provided", "", "", None, [], {}

    try:
        progress(0.1, desc="Initializing...")

        # Accept file paths as well as PIL images.
        if isinstance(image, str):
            image = Image.open(image)

        # Downscale very large images to keep inference manageable.
        max_size = 2048
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = tuple(int(dim * ratio) for dim in image.size)
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        progress(0.2, desc="Preparing prompt...")

        config = MODEL_CONFIGS.get(mode, MODEL_CONFIGS["⚔ Gundam"])
        task_config = TASK_PROMPTS.get(task, TASK_PROMPTS["šŸ“‹ Markdown"])
        language_config = SUPPORTED_LANGUAGES.get(
            language, SUPPORTED_LANGUAGES["šŸŒ Auto-Detect"])

        # Custom task uses the user prompt verbatim; presets use their template.
        if task == "āœļø Custom" and custom_prompt:
            prompt = custom_prompt
        else:
            prompt = task_config["prompt"]
        if language_config["prompt_suffix"]:
            prompt += language_config["prompt_suffix"]

        progress(0.3, desc="Processing image...")

        # The remote-code chat() API takes a file path, so round-trip via disk.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
            image.save(tmp_file.name, "PNG")

        try:
            start_time = time.time()
            progress(0.5, desc="Running OCR model...")

            with torch.no_grad():
                response = model.chat(tokenizer, tmp_file.name, prompt,
                                      do_sample=False)

            processing_time = time.time() - start_time
            progress(0.8, desc="Processing results...")

            refs = extract_grounding_references(response)
            clean_text = clean_output(response)
            raw_output = response

            annotated_img, crops = draw_bounding_boxes(
                image, refs, extract_images=True)

            confidence_data = {
                "processing_time": processing_time,
                "language_detected": language_config["code"],
                "total_regions": len(refs),
                "model_config": config["description"],
                "task_type": task_config["description"],
                "image_size": image.size,
                # Simulated — the model exposes no per-region scores.
                "confidence_scores": [np.random.uniform(0.85, 0.99) for _ in refs],
            }

            progress(1.0, desc="Complete!")
            return clean_text, clean_text, raw_output, annotated_img, crops, confidence_data
        finally:
            # Best-effort temp-file cleanup.
            try:
                os.unlink(tmp_file.name)
            except OSError:
                pass

    except Exception as e:
        error_msg = f"āŒ Processing failed: {str(e)}"
        return error_msg, error_msg, error_msg, None, [], {
            "error": str(e),
            "processing_time": 0,
            "total_regions": 0,
        }


@spaces.GPU(duration=300)
def process_pdf(path, mode, task, custom_prompt,
                language="šŸŒ Auto-Detect", progress=gr.Progress()):
    """Run OCR over every page of a PDF (capped at 50 pages).

    Returns the same 6-tuple shape as process_image; page texts are joined
    under "## Page N" headings and confidences aggregated.
    """
    if not MODEL_LOADED:
        return "āŒ Model not loaded", "", "", None, [], {}

    try:
        progress(0.05, desc="Opening PDF...")
        doc = fitz.open(path)
        total_pages = len(doc)

        if total_pages == 0:
            doc.close()
            return "āŒ PDF is empty", "", "", None, [], {}
        if total_pages > 50:  # Limit for performance
            doc.close()
            return (f"āŒ PDF too large ({total_pages} pages). "
                    f"Maximum 50 pages allowed."), "", "", None, [], {}

        all_text = []
        all_crops = []
        all_confidence = []

        progress(0.1, desc=f"Processing {total_pages} pages...")

        for page_num in range(total_pages):
            try:
                page_progress = 0.1 + (0.8 * page_num / total_pages)
                progress(page_progress,
                         desc=f"Processing page {page_num + 1}/{total_pages}")

                # Render the page at 300 DPI for OCR quality.
                page = doc.load_page(page_num)
                mat = fitz.Matrix(300 / 72, 300 / 72)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                image = Image.open(BytesIO(pix.tobytes("png")))

                text, _, _, annotated_img, crops, confidence = process_image(
                    image, mode, task, custom_prompt, language)

                if text and not text.startswith("āŒ"):
                    all_text.append(f"## Page {page_num + 1}\n\n{text}")
                    all_crops.extend(crops)
                    all_confidence.append(confidence)
            except Exception as e:
                # A bad page should not abort the whole document.
                print(f"Error processing page {page_num + 1}: {e}")
                all_text.append(
                    f"## Page {page_num + 1}\n\nāŒ Error processing this page: {str(e)}")

        doc.close()
        progress(0.95, desc="Finalizing results...")

        combined_text = "\n\n".join(all_text)
        combined_confidence = {
            "total_pages": total_pages,
            "processed_pages": len(all_confidence),
            "total_regions": sum(c.get("total_regions", 0) for c in all_confidence),
            "average_processing_time": (
                np.mean([c.get("processing_time", 0) for c in all_confidence])
                if all_confidence else 0),
            "language_detected": language,
            "pages_confidence": all_confidence,
        }

        progress(1.0, desc="PDF processing complete!")
        return combined_text, combined_text, combined_text, None, all_crops, combined_confidence

    except Exception as e:
        return f"āŒ Error processing PDF: {str(e)}", "", "", None, [], {}


def process_file(path, mode, task, custom_prompt="",
                 language="šŸŒ Auto-Detect", progress=gr.Progress()):
    """Dispatch an uploaded file to the PDF or image pipeline by extension."""
    if not path:
        return "āŒ Error: Please upload a file", "", "", None, [], {}
    try:
        if path.lower().endswith('.pdf'):
            return process_pdf(path, mode, task, custom_prompt, language, progress)
        image = Image.open(path)
        return process_image(image, mode, task, custom_prompt, language, progress)
    except Exception as e:
        return f"āŒ Error processing file: {str(e)}", "", "", None, [], {}


def toggle_prompt(task):
    """Show the custom-prompt textbox only for the Custom task."""
    if task == "āœļø Custom":
        return gr.update(visible=True,
                         placeholder="Enter your custom prompt here...")
    task_config = TASK_PROMPTS.get(task, {})
    description = task_config.get("description", "")
    return gr.update(visible=False, value="",
                     placeholder=f"Using preset: {description}")


def load_image(file_path):
    """Load a preview image from a path; first page at 300 DPI for PDFs."""
    if not file_path:
        return None
    try:
        if file_path.lower().endswith('.pdf'):
            doc = fitz.open(file_path)
            if len(doc) == 0:
                doc.close()
                return None
            page = doc.load_page(0)
            pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72),
                                  alpha=False)
            img = Image.open(BytesIO(pix.tobytes("png")))
            doc.close()
            return img
        return Image.open(file_path)
    except Exception as e:
        print(f"Error loading image: {e}")
        return None


def format_confidence_display(confidence_data):
    """Render the confidence/stats dict from processing as display markdown."""
    if not confidence_data or "error" in confidence_data:
        return "āŒ **Processing Failed**\n\nNo confidence data available."

    display = "šŸ“Š **Processing Statistics**\n\n"
    display += f"ā±ļø **Processing Time**: {confidence_data.get('processing_time', 0):.2f}s\n"
    display += f"šŸŒ **Language**: {confidence_data.get('language_detected', 'Unknown')}\n"
    display += f"šŸ“¦ **Regions Detected**: {confidence_data.get('total_regions', 0)}\n"
    display += f"āš™ļø **Model Config**: {confidence_data.get('model_config', 'Unknown')}\n"
    display += f"šŸŽÆ **Task Type**: {confidence_data.get('task_type', 'Unknown')}\n"

    if 'image_size' in confidence_data:
        size = confidence_data['image_size']
        display += f"šŸ–¼ļø **Image Size**: {size[0]}Ɨ{size[1]}px\n"

    if 'confidence_scores' in confidence_data and confidence_data['confidence_scores']:
        scores = confidence_data['confidence_scores']
        avg_conf = np.mean(scores)
        min_conf = np.min(scores)
        max_conf = np.max(scores)
        display += f"\nšŸ“ˆ **Confidence Analysis**\n"
        display += f"- Average: {avg_conf:.1%}\n"
        display += f"- Range: {min_conf:.1%} - {max_conf:.1%}\n"

    # PDF runs carry page-level aggregates.
    if 'total_pages' in confidence_data:
        display += f"\nšŸ“„ **PDF Statistics**\n"
        display += f"- Total Pages: {confidence_data.get('total_pages', 0)}\n"
        display += f"- Processed Pages: {confidence_data.get('processed_pages', 0)}\n"
        display += f"- Average Time/Page: {confidence_data.get('average_processing_time', 0):.2f}s\n"

    return display


def create_custom_css():
    """Return the custom CSS for the Gradio theme."""
    return """
    .main-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        background-image: url('https://res.cloudinary.com/dsmgydskc/image/upload/v1761752081/bg_banner_ua46qt.png');
        background-size: cover;
        background-position: center;
        background-blend-mode: overlay;
        padding: 20px;
        border-radius: 10px;
        margin-bottom: 20px;
        box-shadow: 0 4px 15px rgba(0,0,0,0.2);
    }
    .input-panel {
        background: rgba(255, 255, 255, 0.05);
        backdrop-filter: blur(10px);
        border-radius: 15px;
        padding: 20px;
        margin-right: 10px;
        border: 1px solid rgba(255, 255, 255, 0.1);
    }
    .output-panel {
        background: rgba(255, 255, 255, 0.05);
        backdrop-filter: blur(10px);
        border-radius: 15px;
        padding: 20px;
        margin-left: 10px;
        border: 1px solid rgba(255, 255, 255, 0.1);
    }
    .primary-button {
        background: linear-gradient(45deg, #667eea, #764ba2) !important;
        border: none !important;
        color: white !important;
        font-weight: bold !important;
        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
        transition: all 0.3s ease !important;
    }
    .primary-button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6) !important;
    }
    .gradio-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        min-height: 100vh;
    }
    .gr-form {
        background: rgba(255, 255, 255, 0.1) !important;
        backdrop-filter: blur(10px) !important;
        border-radius: 15px !important;
        border: 1px solid rgba(255, 255, 255, 0.2) !important;
    }
    """


# Gradio Interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Multi-language & Handwritten Text Extraction Demo",
    css=create_custom_css(),
) as demo:
    # Header with background styling.
    # NOTE(review): the original header HTML markup was lost (tags stripped);
    # reconstructed with minimal equivalent structure.
    with gr.Row(elem_classes="main-header"):
        gr.HTML("""
        <div style="text-align: center; color: white;">
            <h1>šŸŒ Multi-language & Handwritten Text Extraction Demo</h1>
            <p>Powered by DeepSeek-OCR | Extract text from images and PDFs in multiple languages</p>
            <p>✨ Support for handwritten text, tables, and 12+ languages with confidence scoring</p>
        </div>
        """)

    with gr.Row():
        # Input Panel
        with gr.Column(scale=1, elem_classes="input-panel"):
            gr.Markdown("### šŸ“¤ Input Configuration")

            with gr.Tabs():
                with gr.Tab("šŸ“ File Upload"):
                    file_in = gr.File(
                        label="Upload Image or PDF",
                        file_types=["image", ".pdf"],
                        type="filepath",
                    )

            gr.Markdown("### āš™ļø Processing Settings")
            language = gr.Dropdown(
                choices=list(SUPPORTED_LANGUAGES.keys()),
                value="šŸŒ Auto-Detect",
                label="Language",
            )
            mode = gr.Dropdown(
                choices=list(MODEL_CONFIGS.keys()),
                value="⚔ Gundam",
                label="Processing Mode",
            )
            task = gr.Dropdown(
                choices=list(TASK_PROMPTS.keys()),
                value="šŸ“‹ Markdown",
                label="Task Type",
            )
            prompt = gr.Textbox(
                label="Custom Prompt",
                lines=3,
                visible=False,
                placeholder="Enter your custom prompt here...",
            )

            with gr.Row():
                btn = gr.Button(
                    "šŸš€ Extract Text",
                    variant="primary",
                    size="lg",
                    elem_classes="primary-button",
                )
                clear_btn = gr.Button(
                    "šŸ—‘ļø Clear",
                    variant="secondary",
                    size="lg",
                )

        # Output Panel
        with gr.Column(scale=2, elem_classes="output-panel"):
            gr.Markdown("### šŸ“Š Results")
            with gr.Tabs():
                with gr.Tab("šŸ“ Extracted Text"):
                    text_out = gr.Textbox(
                        lines=15,
                        show_copy_button=True,
                        show_label=False,
                        placeholder="Extracted text will appear here...",
                    )
                with gr.Tab("šŸŽØ Markdown"):
                    md_out = gr.Markdown(
                        value="Markdown output will appear here...",
                        show_label=False,
                    )
                with gr.Tab("šŸ–¼ļø Annotated Image"):
                    img_out = gr.Image(
                        type="pil",
                        height=500,
                        show_label=False,
                    )
                with gr.Tab("šŸ–¼ļø Extracted Regions"):
                    gallery = gr.Gallery(
                        show_label=False,
                        columns=3,
                        height=400,
                    )
                with gr.Tab("šŸ“Š Confidence & Stats"):
                    confidence_out = gr.Markdown(
                        value="Processing statistics will appear here...",
                        show_label=False,
                    )
                with gr.Tab("šŸ” Raw Output"):
                    raw_out = gr.Textbox(
                        lines=15,
                        show_copy_button=True,
                        show_label=False,
                        placeholder="Raw model output will appear here...",
                    )

    # Information section
    gr.Markdown("""
    ## ā„¹ļø Information

    ### šŸ”§ Processing Modes
    - **Gundam**: 1024 base + 640 tiles with cropping - Best balance
    - **Tiny**: 512Ɨ512, no crop - Fastest
    - **Small**: 640Ɨ640, no crop - Quick
    - **Base**: 1024Ɨ1024, no crop - Standard
    - **Large**: 1280Ɨ1280, no crop - Highest quality

    ### šŸ“‹ Task Types
    - **Markdown**: Convert document to structured markdown (grounding āœ…)
    - **Free OCR**: Simple text extraction
    - **Locate**: Find specific text in image (grounding āœ…)
    - **Describe**: General image description
    - **Handwritten**: Specialized handwritten text extraction (grounding āœ…)
    - **Table Extract**: Extract and format table data (grounding āœ…)
    - **Custom**: Your own prompt (add `<|grounding|>` for boxes)

    ### šŸŒ Language Support
    Supports 12+ languages including English, Arabic, Urdu, Chinese, Japanese, Korean, Spanish, French, German, Hindi, and Russian with automatic language detection.

    ### šŸ’” Tips
    - Use **Gundam mode** for best results
    - **Handwritten task** works best for handwritten documents
    - **Table Extract** automatically formats tables into markdown
    - Confidence scores show model certainty for each detected region
    - PDF processing supports up to 50 pages
    """)

    # Event handlers

    def process_with_language(file_path, mode, task, prompt, language):
        """Main click handler: run process_file and format the stats panel.

        FIX: the previous signature took an extra `webcam_image` parameter,
        but btn.click only wires 5 inputs — every click raised a TypeError.
        """
        try:
            if not file_path:
                return (
                    "āŒ Please upload a file",
                    "No input provided",
                    "",
                    None,
                    [],
                    "āŒ **Error**: No input provided",
                )

            results = process_file(file_path, mode, task, prompt, language)
            (text_result, md_result, raw_result,
             img_result, gallery_result, confidence_data) = results

            confidence_display = format_confidence_display(confidence_data)
            return (text_result, md_result, raw_result,
                    img_result, gallery_result, confidence_display)
        except Exception as e:
            error_msg = f"āŒ Processing failed: {str(e)}"
            return error_msg, error_msg, error_msg, None, [], error_msg

    def clear_all():
        """Reset all inputs and outputs.

        FIX: previously returned 9 values (including nonexistent webcam/input
        image components) against 7 wired outputs; now matches exactly.
        """
        return (
            None,  # file_in
            "",    # text_out
            "Ready for new input...",  # md_out
            "",    # raw_out
            None,  # img_out
            [],    # gallery
            "šŸ“Š **Ready**\n\nUpload an image to begin processing.",  # confidence_out
        )

    # Button click handlers
    btn.click(
        process_with_language,
        inputs=[file_in, mode, task, prompt, language],
        outputs=[text_out, md_out, raw_out, img_out, gallery, confidence_out],
    )
    clear_btn.click(
        clear_all,
        outputs=[file_in, text_out, md_out, raw_out, img_out, gallery, confidence_out],
    )


if __name__ == "__main__":
    demo.launch()