Spaces:

VanguardAI
/

Arabic-OCR

Sleeping

App Files Files Community

VanguardAI commited on 18 days ago

Commit

0961ac0

verified ·

1 Parent(s): 4a32a6e

Update app.py

Browse files

Files changed (1) hide show

app.py +225 -1398

app.py CHANGED Viewed

@@ -1,425 +1,126 @@
 import spaces
-import json
-import math
-import os
-import traceback
-from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple
-import re
-import fitz  # PyMuPDF
 import gradio as gr
-import requests
 import torch
-from huggingface_hub import snapshot_download
-from PIL import Image, ImageDraw, ImageFont
 from qwen_vl_utils import process_vision_info
-from transformers import AutoModelForCausalLM, AutoProcessor
-import numpy as np
-# Import Arabic text correction module
-from arabic_corrector import get_corrector
 # ========================================
-# DETERMINISTIC SETTINGS FOR CONSISTENCY
 # ========================================
-# Set seeds for reproducibility - ensures same image always gives same output
-torch.manual_seed(42)
-torch.cuda.manual_seed_all(42)
-np.random.seed(42)
-# Ensure deterministic behavior in PyTorch operations
-torch.backends.cudnn.deterministic = True
-torch.backends.cudnn.benchmark = False
-# Constants
-MIN_PIXELS = 3136
-MAX_PIXELS = 11289600
-IMAGE_FACTOR = 28
-# Prompts
-prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
-1. Bbox format: [x1, y1, x2, y2]
-2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
-3. Text Extraction & Formatting Rules:
-    - Picture: For the 'Picture' category, the text field should be omitted.
-    - Formula: Format its text as LaTeX.
-    - Table: Format its text as HTML.
-    - All Others (Text, Title, etc.): Format their text as Markdown.
-4. Constraints:
-    - The output text must be the original text from the image, with no translation.
-    - All layout elements must be sorted according to human reading order.
-5. Final Output: The entire output must be a single JSON object.
-"""
-# Utility functions
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = 28,
-    min_pixels: int = 3136,
-    max_pixels: int = 11289600,
-):
-    """Rescales the image so that the following conditions are met:
-    1. Both dimensions (height and width) are divisible by 'factor'.
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    if max(height, width) / min(height, width) > 200:
-        raise ValueError(
-            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
-        )
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = round_by_factor(height / beta, factor)
-        w_bar = round_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = round_by_factor(height * beta, factor)
-        w_bar = round_by_factor(width * beta, factor)
-    return h_bar, w_bar
-def fetch_image(image_input, min_pixels: int = None, max_pixels: int = None):
-    """Fetch and process an image"""
-    if isinstance(image_input, str):
-        if image_input.startswith(("http://", "https://")):
-            response = requests.get(image_input)
-            image = Image.open(BytesIO(response.content)).convert('RGB')
-        else:
-            image = Image.open(image_input).convert('RGB')
-    elif isinstance(image_input, Image.Image):
-        image = image_input.convert('RGB')
-    else:
-        raise ValueError(f"Invalid image input type: {type(image_input)}")
-    if min_pixels is not None or max_pixels is not None:
-        min_pixels = min_pixels or MIN_PIXELS
-        max_pixels = max_pixels or MAX_PIXELS
-        height, width = smart_resize(
-            image.height,
-            image.width,
-            factor=IMAGE_FACTOR,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels
-        )
-        image = image.resize((width, height), Image.LANCZOS)
-    return image
-def load_images_from_pdf(pdf_path: str) -> List[Image.Image]:
-    """Load images from PDF file"""
-    images = []
-    try:
-        pdf_document = fitz.open(pdf_path)
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            # Convert page to image
-            mat = fitz.Matrix(2.0, 2.0)  # Increase resolution
-            pix = page.get_pixmap(matrix=mat)
-            img_data = pix.tobytes("ppm")
-            image = Image.open(BytesIO(img_data)).convert('RGB')
-            images.append(image)
-        pdf_document.close()
-    except Exception as e:
-        print(f"Error loading PDF: {e}")
-        return []
-    return images
-def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.Image:
-    """Draw layout bounding boxes on image"""
-    img_copy = image.copy()
-    draw = ImageDraw.Draw(img_copy)
-    # Colors for different categories
-    colors = {
-        'Caption': '#FF6B6B',
-        'Footnote': '#4ECDC4',
-        'Formula': '#45B7D1',
-        'List-item': '#96CEB4',
-        'Page-footer': '#FFEAA7',
-        'Page-header': '#DDA0DD',
-        'Picture': '#FFD93D',
-        'Section-header': '#6C5CE7',
-        'Table': '#FD79A8',
-        'Text': '#74B9FF',
-        'Title': '#E17055'
-    }
     try:
-        # Load a font
-        try:
-            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 12)
-        except Exception:
-            font = ImageFont.load_default()
-        for item in layout_data:
-            if 'bbox' in item and 'category' in item:
-                bbox = item['bbox']
-                category = item['category']
-                color = colors.get(category, '#000000')
-                # Draw rectangle
-                draw.rectangle(bbox, outline=color, width=2)
-                # Draw label
-                label = category
-                label_bbox = draw.textbbox((0, 0), label, font=font)
-                label_width = label_bbox[2] - label_bbox[0]
-                label_height = label_bbox[3] - label_bbox[1]
-                # Position label above the box
-                label_x = bbox[0]
-                label_y = max(0, bbox[1] - label_height - 2)
-                # Draw background for label
-                draw.rectangle(
-                    [label_x, label_y, label_x + label_width + 4, label_y + label_height + 2],
-                    fill=color
-                )
-                # Draw text
-                draw.text((label_x + 2, label_y + 1), label, fill='white', font=font)
-    except Exception as e:
-        print(f"Error drawing layout: {e}")
-    return img_copy
-def is_arabic_text(text: str) -> bool:
-    """Check if text in headers and paragraphs contains mostly Arabic characters"""
-    if not text:
-        return False
-    # Extract text from headers and paragraphs only
-    # Match markdown headers (# ## ###) and regular paragraph text
-    header_pattern = r'^#{1,6}\s+(.+)$'
-    paragraph_pattern = r'^(?!#{1,6}\s|!\[|```|\||\s*[-*+]\s|\s*\d+\.\s)(.+)$'
-    content_text = []
-    for line in text.split('\n'):
-        line = line.strip()
-        if not line:
-            continue
-        # Check for headers
-        header_match = re.match(header_pattern, line, re.MULTILINE)
-        if header_match:
-            content_text.append(header_match.group(1))
-            continue
-        # Check for paragraph text (exclude lists, tables, code blocks, images)
-        if re.match(paragraph_pattern, line, re.MULTILINE):
-            content_text.append(line)
-    if not content_text:
-        return False
-    # Join all content text and check for Arabic characters
-    combined_text = ' '.join(content_text)
-    # Arabic Unicode ranges
-    arabic_chars = 0
-    total_chars = 0
-    for char in combined_text:
-        if char.isalpha():
-            total_chars += 1
-            # Arabic script ranges
-            if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') or ('\u08A0' <= char <= '\u08FF'):
-                arabic_chars += 1
-    if total_chars == 0:
-        return False
-    # Consider text as Arabic if more than 50% of alphabetic characters are Arabic
-    return (arabic_chars / total_chars) > 0.5
-def layoutjson2md(image: Image.Image, layout_data: List[Dict], text_key: str = 'text') -> str:
-    """Convert layout JSON to markdown format"""
-    import base64
-    from io import BytesIO
-    markdown_lines = []
-    try:
-        # Sort items by reading order (top to bottom, left to right)
-        sorted_items = sorted(layout_data, key=lambda x: (x.get('bbox', [0, 0, 0, 0])[1], x.get('bbox', [0, 0, 0, 0])[0]))
-        for item in sorted_items:
-            category = item.get('category', '')
-            text = item.get(text_key, '')
-            bbox = item.get('bbox', [])
-            if category == 'Picture':
-                # Extract image region and embed it
-                if bbox and len(bbox) == 4:
-                    try:
-                        # Extract the image region
-                        x1, y1, x2, y2 = bbox
-                        # Ensure coordinates are within image bounds
-                        x1, y1 = max(0, int(x1)), max(0, int(y1))
-                        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-                        if x2 > x1 and y2 > y1:
-                            cropped_img = image.crop((x1, y1, x2, y2))
-                            # Convert to base64 for embedding
-                            buffer = BytesIO()
-                            cropped_img.save(buffer, format='PNG')
-                            img_data = base64.b64encode(buffer.getvalue()).decode()
-                            # Add as markdown image
-                            markdown_lines.append(f"![Image](data:image/png;base64,{img_data})\n")
-                        else:
-                            markdown_lines.append("![Image](Image region detected)\n")
-                    except Exception as e:
-                        print(f"Error processing image region: {e}")
-                        markdown_lines.append("![Image](Image detected)\n")
-                else:
-                    markdown_lines.append("![Image](Image detected)\n")
-            elif not text:
-                continue
-            elif category == 'Title':
-                markdown_lines.append(f"# {text}\n")
-            elif category == 'Section-header':
-                markdown_lines.append(f"## {text}\n")
-            elif category == 'Text':
-                markdown_lines.append(f"{text}\n")
-            elif category == 'List-item':
-                markdown_lines.append(f"- {text}\n")
-            elif category == 'Table':
-                # If text is already HTML, keep it as is
-                if text.strip().startswith('<'):
-                    markdown_lines.append(f"{text}\n")
-                else:
-                    markdown_lines.append(f"**Table:** {text}\n")
-            elif category == 'Formula':
-                # If text is LaTeX, format it properly
-                if text.strip().startswith('$') or '\\' in text:
-                    markdown_lines.append(f"$$\n{text}\n$$\n")
-                else:
-                    markdown_lines.append(f"**Formula:** {text}\n")
-            elif category == 'Caption':
-                markdown_lines.append(f"*{text}*\n")
-            elif category == 'Footnote':
-                markdown_lines.append(f"^{text}^\n")
-            elif category in ['Page-header', 'Page-footer']:
-                # Skip headers and footers in main content
-                continue
-            else:
-                markdown_lines.append(f"{text}\n")
-            markdown_lines.append("")  # Add spacing
     except Exception as e:
-        print(f"Error converting to markdown: {e}")
-        return str(layout_data)
-    return "\n".join(markdown_lines)
-# Initialize model/processor lazily inside GPU context
-model_id = "rednote-hilab/dots.ocr"
-model_path = "./models/dots-ocr-local"
-model = None
-processor = None
-def ensure_model_loaded():
-    """Lazily download and load model/processor using eager attention (no FlashAttention)."""
-    global model, processor
-    if model is not None and processor is not None:
-        return
-    # Always use eager attention
-    attn_impl = "eager"
-    # Use GPU if available, otherwise CPU
-    if torch.cuda.is_available():
-        dtype = torch.bfloat16  # Use bfloat16 on GPU for consistency
-        device_map = "auto"
-    else:
-        dtype = torch.float32
-        device_map = "cpu"
-    # Download snapshot locally (idempotent)
-    snapshot_download(
-        repo_id=model_id,
-        local_dir=model_path,
-        local_dir_use_symlinks=False,
-    )
-    # Load model/processor
-    loaded_model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        attn_implementation=attn_impl,
-        torch_dtype=dtype,
-        device_map=device_map,
-        trust_remote_code=True,
-        low_cpu_mem_usage=True,
-    )
-    loaded_processor = AutoProcessor.from_pretrained(
-        model_path,
-        trust_remote_code=True,
-    )
-    model = loaded_model
-    processor = loaded_processor
-# Global state variables
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# PDF handling state
-pdf_cache = {
-    "images": [],
-    "current_page": 0,
-    "total_pages": 0,
-    "file_type": None,
-    "is_parsed": False,
-    "results": []
-}
 @spaces.GPU()
-def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> str:
-    """Run inference on an image with the given prompt"""
     try:
         ensure_model_loaded()
         if model is None or processor is None:
-            raise RuntimeError("Model not loaded. Please check model initialization.")
-        # Prepare messages in the expected format
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "image",
-                        "image": image
                     },
-                    {"type": "text", "text": prompt}
-                ]
             }
         ]
         # Apply chat template
         text = processor.apply_chat_template(
-            messages,
-            tokenize=False,
             add_generation_prompt=True
         )
@@ -435,22 +136,16 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
             return_tensors="pt",
         )
-        # Move to the model's primary device (works with device_map as well)
-        primary_device = next(model.parameters()).device
-        inputs = inputs.to(primary_device)
-        # Generate output - DETERMINISTIC MODE
-        # Set seed for complete reproducibility
-        torch.manual_seed(42)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(42)
         with torch.no_grad():
             generated_ids = model.generate(
-                **inputs,
                 max_new_tokens=max_new_tokens,
-                do_sample=False,  # Greedy decoding for deterministic output
-                # Remove temperature/top_p/top_k when do_sample=False for consistency
             )
         # Decode output
@@ -459,1095 +154,226 @@ def inference(image: Image.Image, prompt: str, max_new_tokens: int = 24000) -> s
         ]
         output_text = processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
             clean_up_tokenization_spaces=False
         )
-        return output_text[0] if output_text else ""
-    except Exception as e:
-        print(f"Error during inference: {e}")
-        traceback.print_exc()
-        return f"Error during inference: {str(e)}"
-@spaces.GPU()
-def _generate_text_and_confidence_for_crop(
-    image: Image.Image,
-    max_new_tokens: int = 128,
-) -> Tuple[str, float]:
-    """Generate text for a cropped region and compute average per-token confidence from model scores.
-    Returns (generated_text, average_confidence_percent).
-    """
-    try:
-        ensure_model_loaded()
-        # Prepare a concise extraction prompt for the crop
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {
-                        "type": "text",
-                        "text": "Extract the exact text content from this image region. Output text only without translation or additional words.",
-                    },
-                ],
-            }
-        ]
-        # Apply chat template
-        text = processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        # Process vision information
-        image_inputs, video_inputs = process_vision_info(messages)
-        # Prepare inputs
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        primary_device = next(model.parameters()).device
-        inputs = inputs.to(primary_device)
-        # Set seed for deterministic output
-        torch.manual_seed(42)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(42)
-        # Generate with scores - DETERMINISTIC MODE
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=False,  # Greedy decoding for deterministic output
-                output_scores=True,
-                return_dict_in_generate=True,
-            )
-        sequences = outputs.sequences  # [batch, seq_len]
-        input_len = inputs.input_ids.shape[1]
-        # Trim input prompt ids to isolate generated tokens
-        generated_ids = sequences[:, input_len:]
-        generated_text = processor.batch_decode(
-            generated_ids,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False,
-        )[0].strip()
-        # Compute average probability of chosen tokens
-        confidences: List[float] = []
-        for step, step_scores in enumerate(outputs.scores or []):
-            # step_scores: [batch, vocab]
-            probs = torch.nn.functional.softmax(step_scores, dim=-1)
-            # token id chosen at this step
-            if input_len + step < sequences.shape[1]:
-                chosen_ids = sequences[:, input_len + step].unsqueeze(-1)
-                chosen_probs = probs.gather(dim=-1, index=chosen_ids)  # [batch, 1]
-                confidences.append(float(chosen_probs[0, 0].item()))
-        avg_conf_percent = (sum(confidences) / len(confidences) * 100.0) if confidences else 0.0
-        return generated_text, avg_conf_percent
-    except Exception as e:
-        print(f"Error generating crop confidence: {e}")
-        traceback.print_exc()
-        return "", 0.0
-def estimate_text_density(image: Image.Image) -> float:
-    """
-    Estimate text density in image using pixel analysis.
-    Returns value between 0.0 (no text) and 1.0 (very dense text).
-    """
-    try:
-        # Convert to grayscale
-        img_gray = image.convert('L')
-        img_array = np.array(img_gray)
-        # Apply Otsu's thresholding to isolate text-like regions
-        # Text regions are typically darker than background
-        threshold = np.mean(img_array) * 0.7  # Adaptive threshold
-        text_mask = img_array < threshold
-        # Calculate text density
-        text_pixels = np.sum(text_mask)
-        total_pixels = img_array.size
-        density = text_pixels / total_pixels
-        return min(density, 1.0)
-    except Exception as e:
-        print(f"Warning: Could not estimate text density: {e}")
-        return 0.1  # Default to low density
-def should_chunk_image(image: Image.Image) -> Tuple[bool, str]:
-    """
-    Intelligently determine if image should be chunked for better accuracy.
-    Returns (should_chunk, reason).
-    """
-    width, height = image.size
-    total_pixels = width * height
-    density = estimate_text_density(image)
-    # Criteria for chunking (prioritizing ACCURACY)
-    # 1. Very large images (>8MP) - model struggles with layout detection
-    if total_pixels > 8_000_000:
-        return True, f"Large image ({total_pixels/1_000_000:.1f}MP) - chunking for better layout detection"
-    # 2. Dense text (>25% coverage) in large image - overwhelming for single pass
-    if density > 0.25 and total_pixels > 4_000_000:
-        return True, f"Dense text ({density*100:.1f}% coverage) in large image - chunking for accuracy"
-    # 3. Very dense text (>40%) regardless of size - likely tables/forms
-    if density > 0.40:
-        return True, f"Very dense text ({density*100:.1f}% coverage) - likely structured document, chunking"
-    # 4. Extreme aspect ratio - likely scrolled document
-    aspect_ratio = max(width, height) / min(width, height)
-    if aspect_ratio > 3.0 and total_pixels > 3_000_000:
-        return True, f"Extreme aspect ratio ({aspect_ratio:.1f}) - chunking vertically"
-    return False, "Image size and density within optimal range"
-def chunk_image_intelligently(image: Image.Image) -> List[Dict[str, Any]]:
-    """
-    Chunk image into optimal pieces for processing.
-    Uses overlap to prevent text cutting and smart sizing for accuracy.
-    Returns list of chunks with metadata.
-    """
-    width, height = image.size
-    # Determine optimal chunk size based on density and dimensions
-    density = estimate_text_density(image)
-    if density > 0.40:
-        # Very dense - use smaller chunks for better accuracy
-        chunk_size = 1600
-    elif density > 0.25:
-        # Moderate density
-        chunk_size = 2048
-    else:
-        # Lower density - can use larger chunks
-        chunk_size = 2800
-    overlap = 150  # Generous overlap to prevent text cutting
-    chunks = []
-    chunk_id = 0
-    # Calculate grid
-    y_positions = list(range(0, height, chunk_size - overlap))
-    if y_positions[-1] + chunk_size < height:
-        y_positions.append(height - chunk_size)
-    x_positions = list(range(0, width, chunk_size - overlap))
-    if x_positions[-1] + chunk_size < width:
-        x_positions.append(width - chunk_size)
-    for y in y_positions:
-        for x in x_positions:
-            x1, y1 = max(0, x), max(0, y)
-            x2 = min(x1 + chunk_size, width)
-            y2 = min(y1 + chunk_size, height)
-            # Skip if chunk is too small (overlap region)
-            if (x2 - x1) < chunk_size // 2 or (y2 - y1) < chunk_size // 2:
-                continue
-            chunk_img = image.crop((x1, y1, x2, y2))
-            chunks.append({
-                'id': chunk_id,
-                'image': chunk_img,
-                'offset': (x1, y1),
-                'bbox': (x1, y1, x2, y2),
-                'size': (x2 - x1, y2 - y1)
-            })
-            chunk_id += 1
-    print(f"📐 Chunked into {len(chunks)} pieces (chunk_size={chunk_size}, overlap={overlap})")
-    return chunks
-def merge_chunk_results(chunk_results: List[Dict[str, Any]], original_size: Tuple[int, int]) -> Dict[str, Any]:
-    """
-    Intelligently merge results from multiple chunks.
-    Handles overlapping regions and deduplication.
-    """
-    merged_layout = []
-    seen_regions = set()
-    for chunk_result in chunk_results:
-        offset_x, offset_y = chunk_result['offset']
-        for item in chunk_result.get('layout_result', []):
-            bbox = item.get('bbox', [])
-            if not bbox or len(bbox) != 4:
-                continue
-            # Adjust bbox to original image coordinates
-            adjusted_bbox = [
-                bbox[0] + offset_x,
-                bbox[1] + offset_y,
-                bbox[2] + offset_x,
-                bbox[3] + offset_y
-            ]
-            # Simple deduplication: check if similar region already exists
-            region_key = (
-                adjusted_bbox[0] // 50,  # Grid-based dedup (50px tolerance)
-                adjusted_bbox[1] // 50,
-                adjusted_bbox[2] // 50,
-                adjusted_bbox[3] // 50,
-                item.get('category', 'Text')
-            )
-            if region_key in seen_regions:
-                continue
-            seen_regions.add(region_key)
-            # Create merged item
-            merged_item = item.copy()
-            merged_item['bbox'] = adjusted_bbox
-            merged_layout.append(merged_item)
-    # Sort by reading order (top to bottom, left to right)
-    merged_layout.sort(key=lambda x: (x.get('bbox', [0, 0])[1], x.get('bbox', [0, 0])[0]))
-    # Create merged result
-    merged_result = {
-        'layout_result': merged_layout,
-        'is_merged': True,
-        'num_chunks': len(chunk_results)
-    }
-    return merged_result
-def process_image(
-    image: Image.Image,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-    max_new_tokens: int = 24000,
-) -> Dict[str, Any]:
-    """
-    Process a single image with intelligent chunking for accuracy.
-    Automatically detects dense/large images and chunks them for better results.
-    """
-    try:
-        original_image = image.copy()
-        original_size = image.size
-        # Resize image if needed
-        if min_pixels is not None or max_pixels is not None:
-            image = fetch_image(image, min_pixels=min_pixels, max_pixels=max_pixels)
-        # 🎯 INTELLIGENT CHUNKING: Check if image needs chunking for better accuracy
-        needs_chunking, reason = should_chunk_image(image)
-        if needs_chunking:
-            print(f"🔄 {reason}")
-            print(f"   Processing in chunks for maximum accuracy...")
-            # Chunk the image
-            chunks = chunk_image_intelligently(image)
-            # Process each chunk
-            chunk_results = []
-            for i, chunk_data in enumerate(chunks):
-                print(f"   Processing chunk {i+1}/{len(chunks)}...")
-                chunk_img = chunk_data['image']
-                # Process this chunk with full quality
-                chunk_output = inference(chunk_img, prompt, max_new_tokens=max_new_tokens)
-                try:
-                    chunk_layout = json.loads(chunk_output)
-                    chunk_results.append({
-                        'layout_result': chunk_layout,
-                        'offset': chunk_data['offset'],
-                        'bbox': chunk_data['bbox']
-                    })
-                except json.JSONDecodeError:
-                    print(f"   ⚠️ Chunk {i+1} failed to parse, skipping")
-                    continue
-            # Merge chunk results intelligently
-            if chunk_results:
-                merged = merge_chunk_results(chunk_results, original_size)
-                layout_data = merged['layout_result']
-                raw_output = json.dumps(layout_data, ensure_ascii=False)
-                print(f"✅ Merged {len(chunk_results)} chunks into {len(layout_data)} regions")
-            else:
-                print(f"⚠️ All chunks failed, falling back to single-pass")
-                raw_output = inference(image, prompt, max_new_tokens=max_new_tokens)
-        else:
-            print(f"✅ {reason} - processing in single pass")
-            # Standard single-pass processing
-            raw_output = inference(image, prompt, max_new_tokens=max_new_tokens)
-        # Process results based on prompt mode
-        result = {
-            'original_image': image,
-            'raw_output': raw_output,
-            'processed_image': image,
-            'layout_result': None,
-            'markdown_content': None
-        }
-        # Try to parse JSON and create visualizations (since we're doing layout analysis)
-        try:
-            # Try to parse JSON output
-            layout_data = json.loads(raw_output)
-            # 🎯 INTELLIGENT CONFIDENCE SCORING
-            # Count text regions to determine if per-region scoring is feasible
-            num_text_regions = sum(1 for item in layout_data
-                                  if item.get('text') and item.get('category') not in ['Picture'])
-            # For dense documents (>15 regions), skip expensive per-region scoring
-            # This prioritizes speed on dense images while maintaining OCR accuracy
-            if num_text_regions <= 15:
-                print(f"📊 Computing per-region confidence for {num_text_regions} regions...")
-                # Compute per-region confidence using the model on each cropped region
-                for idx, item in enumerate(layout_data):
-                    try:
-                        bbox = item.get('bbox', [])
-                        text_content = item.get('text', '')
-                        category = item.get('category', '')
-                        if (not text_content) or category == 'Picture' or not bbox or len(bbox) != 4:
-                            continue
-                        x1, y1, x2, y2 = bbox
-                        x1, y1 = max(0, int(x1)), max(0, int(y1))
-                        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-                        if x2 <= x1 or y2 <= y1:
-                            continue
-                        crop_img = image.crop((x1, y1, x2, y2))
-                        # Generate and score text for this crop; we only keep the confidence
-                        _, region_conf = _generate_text_and_confidence_for_crop(crop_img)
-                        item['confidence'] = region_conf
-                    except Exception as e:
-                        print(f"Error scoring region {idx}: {e}")
-                        # Leave confidence absent if scoring fails
-            else:
-                print(f"⚡ Skipping per-region confidence scoring ({num_text_regions} regions - using fast mode)")
-                print(f"   OCR accuracy maintained, confidence estimated from model output")
-                # Assign reasonable default confidence based on successful parsing
-                for item in layout_data:
-                    if item.get('text') and item.get('category') not in ['Picture']:
-                        item['confidence'] = 87.5  # Reasonable estimate for successful OCR
-            result['layout_result'] = layout_data
-            # Create visualization with bounding boxes
-            try:
-                processed_image = draw_layout_on_image(image, layout_data)
-                result['processed_image'] = processed_image
-            except Exception as e:
-                print(f"Error drawing layout: {e}")
-                result['processed_image'] = image
-            # Generate markdown from layout data
-            try:
-                markdown_content = layoutjson2md(image, layout_data, text_key='text')
-                result['markdown_content'] = markdown_content
-            except Exception as e:
-                print(f"Error generating markdown: {e}")
-                result['markdown_content'] = raw_output
-            # ✨ ARABIC TEXT CORRECTION: Apply intelligent correction to each text region
-            try:
-                print("🔧 Applying Arabic text correction...")
-                corrector = get_corrector()
-                for idx, item in enumerate(layout_data):
-                    text_content = item.get('text', '')
-                    category = item.get('category', '')
-                    # Only correct text regions (skip pictures, formulas, etc.)
-                    if not text_content or category in ['Picture', 'Formula', 'Table']:
-                        continue
-                    # Apply correction
-                    correction_result = corrector.correct_text(text_content)
-                    # Store both original and corrected versions
-                    item['text_original'] = text_content
-                    item['text_corrected'] = correction_result['corrected']
-                    item['correction_confidence'] = correction_result['overall_confidence']
-                    item['corrections_made'] = correction_result['corrections_made']
-                    item['word_corrections'] = correction_result['words']
-                    # Update the text field to use corrected version
-                    item['text'] = correction_result['corrected']
-                # Regenerate markdown with corrected text
-                corrected_markdown = layoutjson2md(image, layout_data, text_key='text')
-                result['markdown_content_corrected'] = corrected_markdown
-                result['markdown_content_original'] = markdown_content
-                print(f"✅ Correction complete")
-            except Exception as e:
-                print(f"⚠️ Error during Arabic correction: {e}")
-                traceback.print_exc()
-                # Fallback: keep original text
-                result['markdown_content_corrected'] = markdown_content
-                result['markdown_content_original'] = markdown_content
-        except json.JSONDecodeError:
-            print("Failed to parse JSON output, using raw output")
-            result['markdown_content'] = raw_output
-            result['markdown_content_original'] = raw_output
-            result['markdown_content_corrected'] = raw_output
-        return result
     except Exception as e:
-        print(f"Error processing image: {e}")
         traceback.print_exc()
-        return {
-            'original_image': image,
-            'raw_output': f"Error processing image: {str(e)}",
-            'processed_image': image,
-            'layout_result': None,
-            'markdown_content': f"Error processing image: {str(e)}"
-        }
-def load_file_for_preview(file_path: str) -> Tuple[Optional[Image.Image], str]:
-    """Load file for preview (supports PDF and images)"""
-    global pdf_cache
-    if not file_path or not os.path.exists(file_path):
-        return None, "No file selected"
-    file_ext = os.path.splitext(file_path)[1].lower()
-    try:
-        if file_ext == '.pdf':
-            # Load PDF pages
-            images = load_images_from_pdf(file_path)
-            if not images:
-                return None, "Failed to load PDF"
-            pdf_cache.update({
-                "images": images,
-                "current_page": 0,
-                "total_pages": len(images),
-                "file_type": "pdf",
-                "is_parsed": False,
-                "results": []
-            })
-            return images[0], f"Page 1 / {len(images)}"
-        elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
-            # Load single image
-            image = Image.open(file_path).convert('RGB')
-            pdf_cache.update({
-                "images": [image],
-                "current_page": 0,
-                "total_pages": 1,
-                "file_type": "image",
-                "is_parsed": False,
-                "results": []
-            })
-            return image, "Page 1 / 1"
-        else:
-            return None, f"Unsupported file format: {file_ext}"
-    except Exception as e:
-        print(f"Error loading file: {e}")
-        return None, f"Error loading file: {str(e)}"
-def turn_page(direction: str) -> Tuple[Optional[Image.Image], str, List, Any, Optional[Image.Image], Optional[Dict]]:
-    """Navigate through PDF pages and update all relevant outputs."""
-    global pdf_cache
-    if not pdf_cache["images"]:
-        return None, '<div class="page-info">No file loaded</div>', [], "No results yet", None, None
-    if direction == "prev":
-        pdf_cache["current_page"] = max(0, pdf_cache["current_page"] - 1)
-    elif direction == "next":
-        pdf_cache["current_page"] = min(
-            pdf_cache["total_pages"] - 1,
-            pdf_cache["current_page"] + 1
-        )
-    index = pdf_cache["current_page"]
-    current_image_preview = pdf_cache["images"][index]
-    page_info_html = f'<div class="page-info">Page {index + 1} / {pdf_cache["total_pages"]}</div>'
-    # Initialize default result values
-    markdown_content = "Page not processed yet"
-    processed_img = None
-    layout_json = None
-    ocr_table_data = []
-    # Get results for current page if available
-    if (pdf_cache["is_parsed"] and
-        index < len(pdf_cache["results"]) and
-        pdf_cache["results"][index]):
-        result = pdf_cache["results"][index]
-        markdown_content = result.get('markdown_content') or result.get('raw_output', 'No content available')
-        processed_img = result.get('processed_image', None) # Get the processed image
-        layout_json = result.get('layout_result', None) # Get the layout JSON
-        # Generate OCR table for current page
-        if layout_json and result.get('original_image'):
-            # Need to import the helper here or move it outside
-            import base64
-            from io import BytesIO
-            for idx, item in enumerate(layout_json):
-                bbox = item.get('bbox', [])
-                text = item.get('text', '')
-                category = item.get('category', '')
-                if not text or category == 'Picture':
-                    continue
-                img_html = ""
-                if bbox and len(bbox) == 4:
-                    try:
-                        x1, y1, x2, y2 = bbox
-                        orig_img = result['original_image']
-                        x1, y1 = max(0, int(x1)), max(0, int(y1))
-                        x2, y2 = min(orig_img.width, int(x2)), min(orig_img.height, int(y2))
-                        if x2 > x1 and y2 > y1:
-                            cropped_img = orig_img.crop((x1, y1, x2, y2))
-                            buffer = BytesIO()
-                            cropped_img.save(buffer, format='PNG')
-                            img_data = base64.b64encode(buffer.getvalue()).decode()
-                            img_html = f'<img src="data:image/png;base64,{img_data}" style="max-width:200px; max-height:100px; object-fit:contain;" />'
-                    except Exception as e:
-                        print(f"Error cropping region {idx}: {e}")
-                        img_html = f"<div>Region {idx+1}</div>"
-                else:
-                    img_html = f"<div>Region {idx+1}</div>"
-                # Extract confidence from item if available, otherwise N/A
-                confidence = item.get('confidence', 'N/A')
-                if isinstance(confidence, (int, float)):
-                    confidence = f"{confidence:.1f}%"
-                elif confidence != 'N/A':
-                    confidence = str(confidence)
-                ocr_table_data.append([img_html, text, confidence])
-    # Check for Arabic text to set RTL property
-    if is_arabic_text(markdown_content):
-        markdown_update = gr.update(value=markdown_content, rtl=True)
-    else:
-        markdown_update = markdown_content
-    return current_image_preview, page_info_html, ocr_table_data, markdown_update, processed_img, layout_json
 def create_gradio_interface():
-    """Create the Gradio interface"""
     # Custom CSS
     css = """
     .main-container {
-        max-width: 1400px;
         margin: 0 auto;
     }
     .header-text {
         text-align: center;
         color: #2c3e50;
-        margin-bottom: 20px;
     }
     .process-button {
         border: none !important;
         color: white !important;
         font-weight: bold !important;
     }
     .process-button:hover {
         transform: translateY(-2px) !important;
-        box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
-    }
-    .info-box {
-        border: 1px solid #dee2e6;
-        border-radius: 8px;
-        padding: 15px;
-        margin: 10px 0;
-    }
-    .page-info {
-        text-align: center;
-        padding: 8px 16px;
-        border-radius: 20px;
-        font-weight: bold;
-        margin: 10px 0;
     }
-    .model-status {
-        padding: 10px;
         border-radius: 8px;
-        margin: 10px 0;
-        text-align: center;
-        font-weight: bold;
-    }
-    .status-ready {
-        background: #d1edff;
-        color: #0c5460;
-        border: 1px solid #b8daff;
-    }
-    /* Arabic Correction Styling */
-    .original-text-box {
-        background: #fff5f5 !important;
-        border: 2px solid #fc8181 !important;
-        border-radius: 8px;
-        padding: 15px;
         min-height: 300px;
-        direction: rtl;
     }
-    .corrected-text-box {
-        background: #f0fff4 !important;
-        border: 2px solid #68d391 !important;
-        border-radius: 8px;
         padding: 15px;
-        min-height: 300px;
-        direction: rtl;
-    }
-    .correction-high {
-        background: #c6f6d5;
-        padding: 2px 4px;
-        border-radius: 3px;
-    }
-    .correction-medium {
-        background: #fef5e7;
-        padding: 2px 4px;
-        border-radius: 3px;
-    }
-    .correction-low {
-        background: #ffe0e0;
-        padding: 2px 4px;
-        border-radius: 3px;
     }
     """
-    with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Arabic OCR - Document Text Extraction") as demo:
         # Header
         gr.HTML("""
-        <div class="title" style="text-align: center">
-            <h1>🔍 Arabic OCR - Professional Document Text Extraction</h1>
-            <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
-                Advanced AI-powered OCR solution for Arabic documents with high accuracy layout detection and text extraction
             </p>
         </div>
         """)
         # Main interface
         with gr.Row():
-            # Left column - Input and controls
             with gr.Column(scale=1):
-                # File input
-                file_input = gr.File(
-                    label="Upload Image or PDF",
-                    file_types=[".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".pdf"],
-                    type="filepath"
-                )
-                # Image preview
-                image_preview = gr.Image(
-                    label="Preview",
                     type="pil",
-                    interactive=False,
-                    height=300
                 )
-                # Page navigation for PDFs
-                with gr.Row():
-                    prev_page_btn = gr.Button("◀ Previous", size="md")
-                    page_info = gr.HTML('<div class="page-info">No file loaded</div>')
-                    next_page_btn = gr.Button("Next ▶", size="md")
                 # Advanced settings
-                with gr.Accordion("Advanced Settings", open=False):
-                    max_new_tokens = gr.Slider(
-                        minimum=1000,
-                        maximum=32000,
-                        value=24000,
-                        step=1000,
-                        label="Max New Tokens",
-                        info="Maximum number of tokens to generate"
                     )
-                    min_pixels = gr.Number(
-                        value=MIN_PIXELS,
-                        label="Min Pixels",
-                        info="Minimum image resolution"
                     )
-                    max_pixels = gr.Number(
-                        value=MAX_PIXELS,
-                        label="Max Pixels",
-                        info="Maximum image resolution"
-                    )
                 # Process button
                 process_btn = gr.Button(
-                    "🚀 Process Document",
                     variant="primary",
                     elem_classes=["process-button"],
                     size="lg"
                 )
                 # Clear button
-                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-            # Right column - Results
-            with gr.Column(scale=2):
-                # Results tabs
-                with gr.Tabs():
-                    # Processed image tab
-                    with gr.Tab("🖼️ Processed Image"):
-                        processed_image = gr.Image(
-                            label="Image with Layout Detection",
-                            type="pil",
-                            interactive=False,
-                            height=500
-                        )
-                    # ✨ NEW: Arabic Text Correction Comparison Tab
-                    with gr.Tab("✨ Corrected Text (AI)"):
-                        gr.Markdown("""
-                        ### 🔧 AI-Powered Arabic Text Correction
-                        This tab shows **Original OCR** vs **AI-Corrected** text side-by-side.
-                        Corrections use dictionary matching, context analysis, and linguistic intelligence.
-                        """)
-                        with gr.Row():
-                            with gr.Column():
-                                gr.Markdown("#### 📄 Original OCR Output")
-                                original_text_output = gr.Markdown(
-                                    value="Original text will appear here...",
-                                    elem_classes=["original-text-box"]
-                                )
-                            with gr.Column():
-                                gr.Markdown("#### ✅ Corrected Text")
-                                corrected_text_output = gr.Markdown(
-                                    value="Corrected text will appear here...",
-                                    elem_classes=["corrected-text-box"]
-                                )
-                        correction_stats = gr.Markdown(value="")
-                    # Editable OCR Results Table
-                    with gr.Tab("📊 OCR Results Table"):
-                        gr.Markdown("### Editable OCR Results\nReview and edit the extracted text for each detected region")
-                        ocr_table = gr.Dataframe(
-                            headers=["Region Image", "Extracted Text", "Confidence"],
-                            datatype=["html", "str", "str"],
-                            label="OCR Results",
-                            interactive=True,
-                            wrap=True
-                        )
-                    # Markdown output tab
-                    with gr.Tab("📝 Extracted Content"):
-                        markdown_output = gr.Markdown(
-                            value="Click 'Process Document' to see extracted content...",
-                            height=500
-                        )
-                    # JSON layout tab
-                    with gr.Tab("📋 Layout JSON"):
-                        json_output = gr.JSON(
-                            label="Layout Analysis Results",
-                            value=None
-                        )
-        # Helper function to create OCR table
-        def create_ocr_table(image: Image.Image, layout_data: List[Dict]) -> List[List[str]]:
-            """Create table data from layout results with cropped images"""
-            import base64
-            from io import BytesIO
-            if not layout_data:
-                return []
-            table_data = []
-            for idx, item in enumerate(layout_data):
-                bbox = item.get('bbox', [])
-                text = item.get('text', '')
-                category = item.get('category', '')
-                # Skip items without text or Picture category
-                if not text or category == 'Picture':
-                    continue
-                # Crop the image region
-                img_html = ""
-                if bbox and len(bbox) == 4:
-                    try:
-                        x1, y1, x2, y2 = bbox
-                        # Ensure coordinates are within image bounds
-                        x1, y1 = max(0, int(x1)), max(0, int(y1))
-                        x2, y2 = min(image.width, int(x2)), min(image.height, int(y2))
-                        if x2 > x1 and y2 > y1:
-                            cropped_img = image.crop((x1, y1, x2, y2))
-                            # Convert to base64 for HTML display
-                            buffer = BytesIO()
-                            cropped_img.save(buffer, format='PNG')
-                            img_data = base64.b64encode(buffer.getvalue()).decode()
-                            # Create HTML img tag
-                            img_html = f'<img src="data:image/png;base64,{img_data}" style="max-width:200px; max-height:100px; object-fit:contain;" />'
-                    except Exception as e:
-                        print(f"Error cropping region {idx}: {e}")
-                        img_html = f"<div>Region {idx+1}</div>"
-                else:
-                    img_html = f"<div>Region {idx+1}</div>"
-                # Add confidence score - extract from item if available, otherwise N/A
-                confidence = item.get('confidence', 'N/A')
-                if isinstance(confidence, (int, float)):
-                    confidence = f"{confidence:.1f}%"
-                elif confidence != 'N/A':
-                    confidence = str(confidence)
-                # Add row to table
-                table_data.append([img_html, text, confidence])
-            return table_data
         # Event handlers
-        @spaces.GPU()
-        def process_document(file_path, max_tokens, min_pix, max_pix):
-            """Process the uploaded document"""
-            global pdf_cache
             try:
-                # Ensure model/processor are loaded within GPU context
-                ensure_model_loaded()
-                if not file_path:
-                    return None, [], "Please upload a file first.", None
-                if model is None:
-                    return None, [], "Model not loaded. Please refresh the page and try again.", None
-                # Load and preview file
-                image, page_info = load_file_for_preview(file_path)
-                if image is None:
-                    return None, [], page_info, None
-                # Process the image(s)
-                if pdf_cache["file_type"] == "pdf":
-                    # Process all pages for PDF
-                    all_results = []
-                    all_markdown = []
-                    for i, img in enumerate(pdf_cache["images"]):
-                        result = process_image(
-                            img,
-                            min_pixels=int(min_pix) if min_pix else None,
-                            max_pixels=int(max_pix) if max_pix else None,
-                            max_new_tokens=int(max_tokens) if max_tokens else 24000,
-                        )
-                        all_results.append(result)
-                        if result.get('markdown_content'):
-                            all_markdown.append(f"## Page {i+1}\n\n{result['markdown_content']}")
-                    pdf_cache["results"] = all_results
-                    pdf_cache["is_parsed"] = True
-                    # Show results for first page
-                    first_result = all_results[0]
-                    combined_markdown = "\n\n---\n\n".join(all_markdown)
-                    # Check if the combined markdown contains mostly Arabic text
-                    if is_arabic_text(combined_markdown):
-                        markdown_update = gr.update(value=combined_markdown, rtl=True)
-                    else:
-                        markdown_update = combined_markdown
-                    # Create OCR table for first page
-                    ocr_table_data = []
-                    if first_result['layout_result']:
-                        ocr_table_data = create_ocr_table(
-                            first_result['original_image'],
-                            first_result['layout_result']
-                        )
-                    # Prepare correction comparison
-                    original_text = first_result.get('markdown_content_original', first_result.get('markdown_content', ''))
-                    corrected_text = first_result.get('markdown_content_corrected', first_result.get('markdown_content', ''))
-                    # Calculate correction statistics
-                    total_corrections = 0
-                    if first_result.get('layout_result'):
-                        for item in first_result['layout_result']:
-                            total_corrections += item.get('corrections_made', 0)
-                    stats_text = f"### 📊 Correction Statistics\n- **Corrections Made**: {total_corrections}\n- **Method**: Dictionary + Context Analysis"
-                    return (
-                        first_result['processed_image'],
-                        original_text if is_arabic_text(original_text) else gr.update(value=original_text, rtl=False),
-                        corrected_text if is_arabic_text(corrected_text) else gr.update(value=corrected_text, rtl=False),
-                        stats_text,
-                        ocr_table_data,
-                        markdown_update,
-                        first_result['layout_result']
-                    )
-                else:
-                    # Process single image
-                    result = process_image(
-                        image,
-                        min_pixels=int(min_pix) if min_pix else None,
-                        max_pixels=int(max_pix) if max_pix else None,
-                        max_new_tokens=int(max_tokens) if max_tokens else 24000,
-                    )
-                    pdf_cache["results"] = [result]
-                    pdf_cache["is_parsed"] = True
-                    # Check if the content contains mostly Arabic text
-                    content = result['markdown_content'] or "No content extracted"
-                    if is_arabic_text(content):
-                        markdown_update = gr.update(value=content, rtl=True)
-                    else:
-                        markdown_update = content
-                    # Create OCR table
-                    ocr_table_data = []
-                    if result['layout_result']:
-                        ocr_table_data = create_ocr_table(
-                            result['original_image'],
-                            result['layout_result']
-                        )
-                    # Prepare correction comparison
-                    original_text = result.get('markdown_content_original', result.get('markdown_content', ''))
-                    corrected_text = result.get('markdown_content_corrected', result.get('markdown_content', ''))
-                    # Calculate correction statistics
-                    total_corrections = 0
-                    if result.get('layout_result'):
-                        for item in result['layout_result']:
-                            total_corrections += item.get('corrections_made', 0)
-                    stats_text = f"### 📊 Correction Statistics\n- **Corrections Made**: {total_corrections}\n- **Method**: Dictionary + Context Analysis"
-                    return (
-                        result['processed_image'],
-                        original_text if is_arabic_text(original_text) else gr.update(value=original_text, rtl=False),
-                        corrected_text if is_arabic_text(corrected_text) else gr.update(value=corrected_text, rtl=False),
-                        stats_text,
-                        ocr_table_data,
-                        markdown_update,
-                        result['layout_result']
-                    )
             except Exception as e:
-                error_msg = f"Error processing document: {str(e)}"
-                print(error_msg)
-                traceback.print_exc()
-                return None, "Error", "Error", "Error occurred", [], error_msg, None
-        def handle_file_upload(file_path):
-            """Handle file upload and show preview"""
-            if not file_path:
-                return None, "No file loaded"
-            image, page_info = load_file_for_preview(file_path)
-            return image, page_info
-        def handle_page_turn(direction):
-            """Handle page navigation"""
-            image, page_info, result = turn_page(direction)
-            return image, page_info, result
-        def clear_all():
-            """Clear all data and reset interface"""
-            global pdf_cache
-            pdf_cache = {
-                "images": [], "current_page": 0, "total_pages": 0,
-                "file_type": None, "is_parsed": False, "results": []
-            }
-            return (
-                None,  # file_input
-                None,  # image_preview
-                '<div class="page-info">No file loaded</div>',  # page_info
-                None,  # processed_image
-                "Original text will appear here...",  # original_text_output
-                "Corrected text will appear here...",  # corrected_text_output
-                "",  # correction_stats
-                [],  # ocr_table
-                "Click 'Process Document' to see extracted content...",  # markdown_output
-                None,  # json_output
-            )
-        # Wire up event handlers
-        file_input.change(
-            handle_file_upload,
-            inputs=[file_input],
-            outputs=[image_preview, page_info]
-        )
-        # The outputs list is now updated to include all components that need to change
-        prev_page_btn.click(
-            lambda: turn_page("prev"),
-            outputs=[image_preview, page_info, ocr_table, markdown_output, processed_image, json_output]
-        )
-        next_page_btn.click(
-            lambda: turn_page("next"),
-            outputs=[image_preview, page_info, ocr_table, markdown_output, processed_image, json_output]
-        )
         process_btn.click(
-            process_document,
-            inputs=[file_input, max_new_tokens, min_pixels, max_pixels],
-            outputs=[processed_image, original_text_output, corrected_text_output, correction_stats, ocr_table, markdown_output, json_output]
         )
-        # The outputs list for the clear button is now correct
         clear_btn.click(
-            clear_all,
-            outputs=[
-                file_input, image_preview, page_info, processed_image,
-                original_text_output, corrected_text_output, correction_stats,
-                ocr_table, markdown_output, json_output
-            ]
         )
     return demo
@@ -1563,3 +389,4 @@ if __name__ == "__main__":
         debug=True,
         show_error=True
     )

 import spaces
 import gradio as gr
 import torch
+from PIL import Image
 from qwen_vl_utils import process_vision_info
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+import traceback
 # ========================================
+# AIN VLM MODEL FOR OCR
 # ========================================
+# Model configuration
+MODEL_ID = "MBZUAI/AIN"
+# Global model and processor
+model = None
+processor = None
+# Strict OCR-focused prompt
+OCR_PROMPT = """Extract all text from this image exactly as it appears.
+Requirements:
+1. Extract ONLY the text content - do not describe, analyze, or interpret the image
+2. Maintain the original text structure, layout, and formatting
+3. Preserve line breaks, paragraphs, and spacing as they appear
+4. Do not translate the text - keep it in its original language
+5. Do not add any explanations, descriptions, or additional commentary
+6. If there are tables, maintain their structure
+7. If there are headers, titles, or sections, preserve their hierarchy
+Output only the extracted text, nothing else."""
+def ensure_model_loaded():
+    """Lazily load the AIN VLM model and processor."""
+    global model, processor
+    if model is not None and processor is not None:
+        return
+    print("🔄 Loading AIN VLM model...")
     try:
+        # Determine device and dtype
+        if torch.cuda.is_available():
+            device_map = "auto"
+            torch_dtype = "auto"
+            print("✅ Using GPU (CUDA)")
+        else:
+            device_map = "cpu"
+            torch_dtype = torch.float32
+            print("✅ Using CPU")
+        # Load model
+        loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+            trust_remote_code=True,
+        )
+        # Load processor
+        loaded_processor = AutoProcessor.from_pretrained(
+            MODEL_ID,
+            trust_remote_code=True,
+        )
+        model = loaded_model
+        processor = loaded_processor
+        print("✅ Model loaded successfully!")
     except Exception as e:
+        print(f"❌ Error loading model: {e}")
+        traceback.print_exc()
+        raise
 @spaces.GPU()
+def extract_text_from_image(image: Image.Image, custom_prompt: str = None, max_new_tokens: int = 2048) -> str:
+    """
+    Extract text from image using AIN VLM model.
+    Args:
+        image: PIL Image to process
+        custom_prompt: Optional custom prompt (uses default OCR prompt if None)
+        max_new_tokens: Maximum tokens to generate
+    Returns:
+        Extracted text as string
+    """
     try:
+        # Ensure model is loaded
         ensure_model_loaded()
         if model is None or processor is None:
+            return "❌ Error: Model not loaded. Please refresh and try again."
+        # Use custom prompt or default OCR prompt
+        prompt_to_use = custom_prompt if custom_prompt and custom_prompt.strip() else OCR_PROMPT
+        # Prepare messages in the format expected by the model
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "image",
+                        "image": image,
                     },
+                    {
+                        "type": "text",
+                        "text": prompt_to_use
+                    },
+                ],
             }
         ]
         # Apply chat template
         text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
             add_generation_prompt=True
         )
             return_tensors="pt",
         )
+        # Move to device
+        device = next(model.parameters()).device
+        inputs = inputs.to(device)
+        # Generate output
         with torch.no_grad():
             generated_ids = model.generate(
+                **inputs,
                 max_new_tokens=max_new_tokens,
+                do_sample=False,  # Greedy decoding for consistency
             )
         # Decode output
         ]
         output_text = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
             clean_up_tokenization_spaces=False
         )
+        result = output_text[0] if output_text else ""
+        return result.strip() if result else "No text extracted"
     except Exception as e:
+        error_msg = f"❌ Error during text extraction: {str(e)}"
+        print(error_msg)
         traceback.print_exc()
+        return error_msg
 def create_gradio_interface():
+    """Create the Gradio interface for AIN OCR."""
     # Custom CSS
     css = """
     .main-container {
+        max-width: 1200px;
         margin: 0 auto;
     }
     .header-text {
         text-align: center;
         color: #2c3e50;
+        margin-bottom: 30px;
     }
     .process-button {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
         border: none !important;
         color: white !important;
         font-weight: bold !important;
+        font-size: 1.1em !important;
+        padding: 12px 24px !important;
     }
     .process-button:hover {
         transform: translateY(-2px) !important;
+        box-shadow: 0 6px 12px rgba(0,0,0,0.2) !important;
     }
+    .output-text {
+        background: #f8f9fa;
+        border: 2px solid #dee2e6;
         border-radius: 8px;
+        padding: 20px;
         min-height: 300px;
+        font-family: 'Courier New', monospace;
+        white-space: pre-wrap;
+        direction: auto;
     }
+    .info-box {
+        background: #e3f2fd;
+        border-left: 4px solid #2196f3;
         padding: 15px;
+        margin: 10px 0;
+        border-radius: 4px;
     }
     """
+    with gr.Blocks(theme=gr.themes.Soft(), css=css, title="AIN VLM OCR") as demo:
         # Header
         gr.HTML("""
+        <div class="header-text">
+            <h1>🔍 AIN VLM - Vision Language Model OCR</h1>
+            <p style="font-size: 1.1em; color: #6b7280; margin-top: 10px;">
+                Advanced OCR using Vision Language Model (VLM) for accurate text extraction
             </p>
+            <p style="font-size: 0.95em; color: #9ca3af; margin-top: 8px;">
+                Powered by <strong>MBZUAI/AIN</strong> - Specialized for understanding and extracting text from images
+            </p>
+        </div>
+        """)
+        # Info box
+        gr.Markdown("""
+        <div class="info-box">
+        <strong>ℹ️ How it works:</strong> Upload an image containing text, click "Process Image", and get the extracted text.
+        The VLM model intelligently understands context and can handle handwritten text better than traditional OCR models.
         </div>
         """)
         # Main interface
         with gr.Row():
+            # Left column - Input
             with gr.Column(scale=1):
+                # Image input
+                image_input = gr.Image(
+                    label="📸 Upload Image",
                     type="pil",
+                    height=400
                 )
                 # Advanced settings
+                with gr.Accordion("⚙️ Advanced Settings", open=False):
+                    custom_prompt = gr.Textbox(
+                        label="Custom Prompt (Optional)",
+                        placeholder="Leave empty to use default OCR prompt...",
+                        lines=4,
+                        info="Customize the prompt if you want specific extraction behavior"
                     )
+                    max_tokens = gr.Slider(
+                        minimum=512,
+                        maximum=4096,
+                        value=2048,
+                        step=128,
+                        label="Max Tokens",
+                        info="Maximum length of extracted text"
                     )
+                    show_prompt_btn = gr.Button("👁️ Show Default Prompt", size="sm")
                 # Process button
                 process_btn = gr.Button(
+                    "🚀 Process Image",
                     variant="primary",
                     elem_classes=["process-button"],
                     size="lg"
                 )
                 # Clear button
+                clear_btn = gr.Button("🗑️ Clear All", variant="secondary", size="lg")
+            # Right column - Output
+            with gr.Column(scale=1):
+                # Text output
+                text_output = gr.Textbox(
+                    label="📝 Extracted Text",
+                    placeholder="Extracted text will appear here...",
+                    lines=20,
+                    max_lines=25,
+                    show_copy_button=True,
+                    interactive=False,
+                    elem_classes=["output-text"]
+                )
+                # Status/info
+                status_output = gr.Markdown(
+                    value="*Ready to process images*",
+                    elem_classes=["info-box"]
+                )
+        # Examples
+        gr.Markdown("### 📚 Example Images")
+        gr.Examples(
+            examples=[
+                ["image/app/1762329983969.png"],
+                ["image/app/1762330009302.png"],
+                ["image/app/1762330020168.png"],
+            ],
+            inputs=image_input,
+            label="Try these examples"
+        )
+        # Default prompt display
+        default_prompt_display = gr.Textbox(
+            label="Default OCR Prompt",
+            value=OCR_PROMPT,
+            lines=10,
+            visible=False,
+            interactive=False
+        )
         # Event handlers
+        def process_image_handler(image, custom_prompt_text, max_tokens_value):
+            """Handle image processing."""
+            if image is None:
+                return "", "⚠️ Please upload an image first."
             try:
+                status = "⏳ Processing image..."
+                extracted_text = extract_text_from_image(
+                    image,
+                    custom_prompt=custom_prompt_text,
+                    max_new_tokens=int(max_tokens_value)
+                )
+                if extracted_text and not extracted_text.startswith("❌"):
+                    status = f"✅ Text extracted successfully! ({len(extracted_text)} characters)"
+                else:
+                    status = "⚠️ No text extracted or error occurred."
+                return extracted_text, status
             except Exception as e:
+                error_msg = f"❌ Error: {str(e)}"
+                return error_msg, "❌ Processing failed."
+        def clear_all_handler():
+            """Clear all inputs and outputs."""
+            return None, "", "", "✨ Ready to process images"
+        def toggle_prompt_display(current_visible):
+            """Toggle the visibility of the default prompt."""
+            return gr.update(visible=not current_visible)
+        # Wire up events
         process_btn.click(
+            process_image_handler,
+            inputs=[image_input, custom_prompt, max_tokens],
+            outputs=[text_output, status_output]
         )
         clear_btn.click(
+            clear_all_handler,
+            outputs=[image_input, text_output, custom_prompt, status_output]
+        )
+        # Show/hide default prompt
+        show_prompt_btn.click(
+            lambda: gr.update(visible=True),
+            outputs=[default_prompt_display]
         )
     return demo
         debug=True,
         show_error=True
     )