|
|
import gradio as gr |
|
|
from transformers import AutoModel, AutoTokenizer |
|
|
import torch |
|
|
import spaces |
|
|
import os |
|
|
import sys |
|
|
import tempfile |
|
|
import shutil |
|
|
from PIL import Image, ImageDraw, ImageFont, ImageOps |
|
|
import fitz |
|
|
import re |
|
|
import warnings |
|
|
import numpy as np |
|
|
import base64 |
|
|
from io import StringIO, BytesIO |
|
|
import time |
|
|
import json |
|
|
|
|
|
|
|
|
# Hugging Face model id; weights and remote code are fetched at startup.
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'

# Prefer GPU when available; everything below honors this device choice.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Load tokenizer + model once at import time.  MODEL_LOADED gates every
# request handler so the UI can still start when the download/load fails.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # fp16 on GPU for speed/memory; fp32 on CPU.
    dtype = torch.float16 if DEVICE.type == 'cuda' else torch.float32

    # Try flash-attention 2 first; fall back to standard attention when the
    # kernel package is not installed on this host.
    try:
        model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=dtype, trust_remote_code=True, use_safetensors=True)
    except Exception:
        print("Flash attention not available, using standard attention")
        model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=dtype, trust_remote_code=True, use_safetensors=True)

    model = model.eval().to(DEVICE)
    MODEL_LOADED = True
except Exception as e:
    print(f"Warning: Could not load model - {e}")
    MODEL_LOADED = False

# Resolution presets shown in the "Processing Mode" dropdown.
# NOTE(review): these values are echoed in the stats display only; they are
# not visibly passed to model.chat in this file — confirm intended use.
MODEL_CONFIGS = {
    "⚡ Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True, "description": "Best balance - 1024 base + 640 tiles with cropping"},
    "🚀 Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False, "description": "Fastest - 512×512, no crop"},
    "📄 Small": {"base_size": 640, "image_size": 640, "crop_mode": False, "description": "Quick - 640×640, no crop"},
    "📊 Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False, "description": "Standard - 1024×1024, no crop"},
    "🎯 Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False, "description": "Highest quality - 1280×1280, no crop"}
}

# Language presets: each appends a suffix nudging the model toward a target
# language.  "code" is only echoed back in the stats display; no actual
# language detection is performed in this file.
SUPPORTED_LANGUAGES = {
    "🌍 Auto-Detect": {"code": "auto", "prompt_suffix": ""},
    "🇺🇸 English": {"code": "en", "prompt_suffix": " Extract text in English."},
    "🇸🇦 Arabic": {"code": "ar", "prompt_suffix": " Extract text in Arabic. Handle right-to-left text properly."},
    "🇵🇰 Urdu": {"code": "ur", "prompt_suffix": " Extract text in Urdu. Handle right-to-left text properly."},
    "🇨🇳 Chinese": {"code": "zh", "prompt_suffix": " Extract text in Chinese."},
    "🇯🇵 Japanese": {"code": "ja", "prompt_suffix": " Extract text in Japanese."},
    "🇰🇷 Korean": {"code": "ko", "prompt_suffix": " Extract text in Korean."},
    "🇪🇸 Spanish": {"code": "es", "prompt_suffix": " Extract text in Spanish."},
    "🇫🇷 French": {"code": "fr", "prompt_suffix": " Extract text in French."},
    "🇩🇪 German": {"code": "de", "prompt_suffix": " Extract text in German."},
    "🇮🇳 Hindi": {"code": "hi", "prompt_suffix": " Extract text in Hindi."},
    "🇷🇺 Russian": {"code": "ru", "prompt_suffix": " Extract text in Russian."}
}

# Task presets.  Prompts containing <|grounding|> make the model emit
# bounding-box references; has_grounding flags that for downstream display.
TASK_PROMPTS = {
    "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True, "description": "Convert document to structured markdown with grounding"},
    "📝 Free OCR": {"prompt": "<image>\nExtract all text from this image.", "has_grounding": False, "description": "Simple text extraction"},
    "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True, "description": "Find specific text with bounding boxes"},
    "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False, "description": "General image description"},
    "✏️ Handwritten": {"prompt": "<image>\n<|grounding|>Extract handwritten text from this image.", "has_grounding": True, "description": "Specialized handwritten text extraction"},
    "📊 Table Extract": {"prompt": "<image>\n<|grounding|>Extract table data and convert to markdown table format.", "has_grounding": True, "description": "Extract and format table data"},
    "✏️ Custom": {"prompt": "", "has_grounding": False, "description": "Your own custom prompt"}
}
|
|
|
|
|
def extract_grounding_references(text):
    """Parse DeepSeek-OCR grounding output into structured references.

    The model emits grounded spans as
    ``<|ref|>text<|/ref|><|det|>[[x1, y1, x2, y2], ...]<|/det|>``.
    The previous pattern (``<ref>…</ref>``) never matched those tokens and
    returned plain strings, which ``draw_bounding_boxes`` cannot consume
    (it indexes ``ref['ref_seg']`` and calls ``ref.get('content')``).

    Returns a list of dicts — one per detected box — with keys:
    ``content`` (the referenced text) and ``ref_seg`` ([x1, y1, x2, y2]).

    NOTE(review): DeepSeek-OCR box coordinates appear to be normalized
    (0-999); downstream drawing treats them as pixels — confirm scaling.
    """
    if not text:
        return []

    span_re = re.compile(
        r'<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>(.*?)<\|/det\|>', re.DOTALL
    )
    # One quadruple per box; a single ref may carry several boxes.
    box_re = re.compile(r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]')

    refs = []
    for content, det_payload in span_re.findall(text):
        for box in box_re.findall(det_payload):
            refs.append({
                'content': content.strip(),
                'ref_seg': [int(v) for v in box],
            })
    return refs
|
|
|
|
|
def draw_bounding_boxes(image, refs, extract_images=False, show_confidence=True):
    """
    Draw styled bounding boxes for grounding references on a copy of *image*.

    Parameters
    ----------
    image : PIL.Image.Image
        Source image; never mutated (a copy is annotated).
    refs : list[dict]
        Dicts with ``ref_seg`` ([x1, y1, x2, y2]) and optional ``content``,
        as produced by ``extract_grounding_references``.
    extract_images : bool
        When True, also return each box cropped out of the original image.
    show_confidence : bool
        When True, render a confidence label above/below each box.

    Returns
    -------
    (PIL.Image.Image, list[PIL.Image.Image])
        The annotated copy and the list of crops (empty unless requested).
    """
    if not refs:
        return image, []

    # Arial is usually missing on Linux hosts: fall back to Pillow's built-in
    # font.  Narrowed from a bare ``except:`` to the error truetype raises.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
        small_font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        font = ImageFont.load_default()
        small_font = ImageFont.load_default()

    img_with_boxes = image.copy()
    draw = ImageDraw.Draw(img_with_boxes)

    # Region-type colors, chosen by keyword sniffing of the ref content below.
    color_map = {
        'text': '#FF6B6B',
        'table': '#4ECDC4',
        'handwritten': '#45B7D1',
        'title': '#96CEB4',
        'default': '#FFEAA7'
    }

    cropped_images = []
    img_w, img_h = image.size

    for i, ref in enumerate(refs):
        seg = ref.get('ref_seg') if isinstance(ref, dict) else None
        if not seg or len(seg) != 4:
            # Robustness fix: skip malformed refs instead of raising.
            continue

        x1, y1, x2, y2 = (int(v) for v in seg)
        # Normalize ordering and clamp into the image: newer Pillow raises on
        # inverted rectangles, and crops outside the bounds produce padding.
        x1, x2 = sorted((max(0, min(x1, img_w)), max(0, min(x2, img_w))))
        y1, y2 = sorted((max(0, min(y1, img_h)), max(0, min(y2, img_h))))

        # Placeholder: the model exposes no per-region score, so a random
        # value is displayed.  NOTE(review): consider removing this or wiring
        # real scores if the API ever provides them.
        confidence = np.random.uniform(0.85, 0.99)

        # Pick a color from crude keyword heuristics on the ref text.
        text_content = ref.get('content', '').lower()
        if 'table' in text_content or '|' in text_content:
            color = color_map['table']
        elif any(word in text_content for word in ['handwritten', 'signature']):
            color = color_map['handwritten']
        elif any(word in text_content for word in ['title', 'heading', 'header']):
            color = color_map['title']
        else:
            color = color_map['default']

        draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

        if show_confidence:
            conf_text = f"{confidence:.1%}"
            bbox = draw.textbbox((0, 0), conf_text, font=small_font)
            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
            tx = x1
            # Label above the box when there is room, otherwise below it.
            ty = max(0, y1 - th - 6) if y1 > th + 6 else y2 + 2
            draw.rectangle([tx-2, ty-2, tx+tw+2, ty+th+2], fill=color, outline=None)
            draw.text((tx, ty), conf_text, fill='white', font=small_font)

        # Numbered tag in the top-right corner of each box.
        ref_text = f"#{i+1}"
        bbox = draw.textbbox((0, 0), ref_text, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        tx = x2 - tw - 4
        ty = y1 + 4
        draw.rectangle([tx-2, ty-2, tx+tw+2, ty+th+2], fill='black', outline=None)
        draw.text((tx, ty), ref_text, fill='white', font=font)

        if extract_images:
            try:
                cropped_images.append(image.crop((x1, y1, x2, y2)))
            except Exception as e:
                print(f"Error cropping image: {e}")

    return img_with_boxes, cropped_images
|
|
|
|
|
def clean_output(text, include_images=False, remove_labels=False):
    """Strip model markup from *text* and normalize whitespace.

    Removes DeepSeek-OCR grounding artifacts:
    - ``<|det|>[[x1, y1, x2, y2]]<|/det|>`` spans, coordinates included.
      Fix: previously only the tags matched the generic ``<...>`` regex,
      so the raw coordinate arrays leaked into the "clean" text.
    - legacy ``<ref>...</ref>`` spans, content included (original behavior,
      kept for compatibility);
    - any remaining ``<...>`` tags such as ``<|ref|>`` / ``<|grounding|>``,
      keeping the referenced text itself.

    ``include_images`` and ``remove_labels`` are accepted for interface
    compatibility but are currently unused.
    """
    if not text:
        return ""

    # Drop detection-coordinate payloads entirely: positional metadata,
    # not document text.
    text = re.sub(r'<\|det\|>.*?<\|/det\|>', '', text, flags=re.DOTALL)

    # Legacy tag handling (unchanged from the original implementation).
    text = re.sub(r'<ref>.*?</ref>', '', text)
    text = re.sub(r'<[^>]+>', '', text)

    # Collapse runs of blank lines into a single blank line.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()
|
|
|
|
|
def embed_images(markdown, crops):
    """Append cropped region images to *markdown* as base64 data URIs.

    Parameters
    ----------
    markdown : str
        The document text to extend.
    crops : list[PIL.Image.Image]
        Region crops to embed; returned text is unchanged when empty.

    Returns
    -------
    str
        *markdown* plus an "Extracted Regions" section with one inline
        PNG image per crop.

    Bug fix: the base64 payload (``img_str``) was computed but never
    written into the markdown, so no image was ever embedded.
    """
    if not crops:
        return markdown

    embedded_md = markdown + "\n\n## Extracted Regions\n\n"
    for i, crop in enumerate(crops):
        buffered = BytesIO()
        crop.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        embedded_md += (
            f"### Region {i+1}\n\n"
            f"![Region {i+1}](data:image/png;base64,{img_str})\n\n"
        )

    return embedded_md
|
|
|
|
|
@spaces.GPU(duration=60)
def process_image(image, mode, task, custom_prompt, language="🌍 Auto-Detect", progress=gr.Progress()):
    """
    Run DeepSeek-OCR on a single image.

    Parameters
    ----------
    image : PIL.Image.Image | str
        Image to process, or a filesystem path to one.
    mode : str
        Key into MODEL_CONFIGS.  NOTE(review): the preset is only echoed in
        the stats dict; it is not visibly passed to ``model.chat`` here.
    task : str
        Key into TASK_PROMPTS selecting the prompt preset.
    custom_prompt : str
        Used verbatim when *task* is "✏️ Custom" and non-empty.
    language : str
        Key into SUPPORTED_LANGUAGES; its suffix is appended to the prompt.
    progress : gr.Progress
        Gradio progress reporter.

    Returns
    -------
    tuple
        (clean_text, markdown_text, raw_output, annotated_image, crops,
        stats_dict); on failure, error strings / empty values in the same
        6-tuple shape.
    """
    if not MODEL_LOADED:
        return "❌ Model not loaded. Please check your setup.", "", "", None, [], {}

    if image is None:
        return "❌ No image provided", "", "", None, [], {}

    try:
        progress(0.1, desc="Initializing...")

        # Accept a filesystem path as well as a PIL image.
        if isinstance(image, str):
            image = Image.open(image)

        # Downscale very large inputs to bound memory use and latency.
        max_size = 2048
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = tuple(int(dim * ratio) for dim in image.size)
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        progress(0.2, desc="Preparing prompt...")

        config = MODEL_CONFIGS.get(mode, MODEL_CONFIGS["⚡ Gundam"])
        task_config = TASK_PROMPTS.get(task, TASK_PROMPTS["📋 Markdown"])
        language_config = SUPPORTED_LANGUAGES.get(language, SUPPORTED_LANGUAGES["🌍 Auto-Detect"])

        # Custom task with a non-empty prompt wins; otherwise use the preset.
        if task == "✏️ Custom" and custom_prompt:
            prompt = custom_prompt
        else:
            prompt = task_config["prompt"]

        if language_config["prompt_suffix"]:
            prompt += language_config["prompt_suffix"]

        progress(0.3, desc="Processing image...")

        # Fix: use mkstemp and close the descriptor before writing.  The
        # previous NamedTemporaryFile(delete=False) pattern re-opened the
        # file by name while the handle was still open, which fails on
        # Windows (the file cannot be opened twice there).
        fd, tmp_path = tempfile.mkstemp(suffix=".png")
        os.close(fd)
        image.save(tmp_path, "PNG")

        try:
            start_time = time.time()
            progress(0.5, desc="Running OCR model...")

            # model.chat reads the image from disk; greedy decoding.
            with torch.no_grad():
                response = model.chat(tokenizer, tmp_path, prompt, do_sample=False)

            processing_time = time.time() - start_time
            progress(0.8, desc="Processing results...")

            refs = extract_grounding_references(response)
            clean_text = clean_output(response)
            raw_output = response

            annotated_img, crops = draw_bounding_boxes(image, refs, extract_images=True)

            confidence_data = {
                "processing_time": processing_time,
                "language_detected": language_config["code"],
                "total_regions": len(refs),
                "model_config": config["description"],
                "task_type": task_config["description"],
                "image_size": image.size,
                # NOTE(review): placeholder scores — the model does not
                # report per-region confidence.
                "confidence_scores": [np.random.uniform(0.85, 0.99) for _ in refs]
            }

            progress(1.0, desc="Complete!")

            return clean_text, clean_text, raw_output, annotated_img, crops, confidence_data
        finally:
            # Best-effort temp-file cleanup; narrowed from a bare ``except:``.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
    except Exception as e:
        error_msg = f"❌ Processing failed: {str(e)}"
        return error_msg, error_msg, error_msg, None, [], {
            "error": str(e),
            "processing_time": 0,
            "total_regions": 0
        }
|
|
|
|
|
@spaces.GPU(duration=300)
def process_pdf(path, mode, task, custom_prompt, language="🌍 Auto-Detect", progress=gr.Progress()):
    """
    Run OCR over every page of a PDF (max 50 pages) and merge the results.

    Each page is rendered at 300 DPI and fed through ``process_image``; a
    failing page is reported inline rather than aborting the document.

    Returns
    -------
    tuple
        Same 6-tuple shape as ``process_image``: the combined markdown is
        used for the text, markdown and raw slots; the annotated-image slot
        is always None for PDFs; crops and per-page stats are aggregated.
    """
    if not MODEL_LOADED:
        return "❌ Model not loaded", "", "", None, [], {}

    try:
        progress(0.05, desc="Opening PDF...")

        doc = fitz.open(path)
        # Fix: close the document via try/finally so it is not leaked when
        # an unexpected exception escapes the page loop (previously the
        # outer handler returned without closing it).
        try:
            total_pages = len(doc)

            if total_pages == 0:
                return "❌ PDF is empty", "", "", None, [], {}

            if total_pages > 50:
                return f"❌ PDF too large ({total_pages} pages). Maximum 50 pages allowed.", "", "", None, [], {}

            all_text = []
            all_crops = []
            all_confidence = []

            progress(0.1, desc=f"Processing {total_pages} pages...")

            for page_num in range(total_pages):
                try:
                    page_progress = 0.1 + (0.8 * page_num / total_pages)
                    progress(page_progress, desc=f"Processing page {page_num + 1}/{total_pages}")

                    # Render at 300 DPI (PDF native resolution is 72 DPI).
                    page = doc.load_page(page_num)
                    mat = fitz.Matrix(300/72, 300/72)
                    pix = page.get_pixmap(matrix=mat, alpha=False)

                    img_data = pix.tobytes("png")
                    image = Image.open(BytesIO(img_data))

                    # Reuse the single-image pipeline for each page.
                    text, _, _, annotated_img, crops, confidence = process_image(
                        image, mode, task, custom_prompt, language
                    )

                    # Only successful pages contribute text and stats;
                    # process_image error strings start with "❌".
                    if text and not text.startswith("❌"):
                        all_text.append(f"## Page {page_num + 1}\n\n{text}")
                        all_crops.extend(crops)
                        all_confidence.append(confidence)

                except Exception as e:
                    # A bad page must not abort the whole document.
                    print(f"Error processing page {page_num + 1}: {e}")
                    all_text.append(f"## Page {page_num + 1}\n\n❌ Error processing this page: {str(e)}")
        finally:
            doc.close()

        progress(0.95, desc="Finalizing results...")

        combined_text = "\n\n".join(all_text)

        combined_confidence = {
            "total_pages": total_pages,
            "processed_pages": len(all_confidence),
            "total_regions": sum(c.get("total_regions", 0) for c in all_confidence),
            "average_processing_time": np.mean([c.get("processing_time", 0) for c in all_confidence]) if all_confidence else 0,
            "language_detected": language,
            "pages_confidence": all_confidence
        }

        progress(1.0, desc="PDF processing complete!")

        return combined_text, combined_text, combined_text, None, all_crops, combined_confidence

    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}", "", "", None, [], {}
|
|
|
|
|
def process_file(path, mode, task, custom_prompt="", language="🌍 Auto-Detect", progress=gr.Progress()):
    """
    Route an uploaded file to the appropriate OCR pipeline.

    PDFs (by extension) go through ``process_pdf``; everything else is
    opened as an image and handed to ``process_image``.  Returns the same
    6-tuple shape as those functions, or an error tuple on failure.
    """
    if not path:
        return "❌ Error: Please upload a file", "", "", None, [], {}

    try:
        if path.lower().endswith('.pdf'):
            return process_pdf(path, mode, task, custom_prompt, language, progress)
        return process_image(Image.open(path), mode, task, custom_prompt, language, progress)
    except Exception as e:
        return f"❌ Error processing file: {str(e)}", "", "", None, [], {}
|
|
|
|
|
def toggle_prompt(task):
    """
    Show the custom-prompt textbox only when the Custom task is selected;
    otherwise hide it, clear its value, and surface the preset description
    in the placeholder.
    """
    if task != "✏️ Custom":
        preset = TASK_PROMPTS.get(task, {}).get("description", "")
        return gr.update(visible=False, value="", placeholder=f"Using preset: {preset}")
    return gr.update(visible=True, placeholder="Enter your custom prompt here...")
|
|
|
|
|
def load_image(file_path):
    """
    Load a preview image from *file_path*.

    For PDFs the first page is rendered at 300 DPI; other files are opened
    directly as images.  Returns a PIL image, or None when the path is
    empty or the file cannot be read.
    """
    if not file_path:
        return None

    try:
        if not file_path.lower().endswith('.pdf'):
            return Image.open(file_path)

        doc = fitz.open(file_path)
        if len(doc) == 0:
            doc.close()
            return None
        first_page = doc.load_page(0)
        pixmap = first_page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
        preview = Image.open(BytesIO(pixmap.tobytes("png")))
        doc.close()
        return preview
    except Exception as e:
        print(f"Error loading image: {e}")
        return None
|
|
|
|
|
def format_confidence_display(confidence_data):
    """
    Render the stats dict produced by the OCR pipeline as a markdown
    summary string; falls back to a failure banner when the dict is empty
    or carries an "error" key.
    """
    if not confidence_data or "error" in confidence_data:
        return "❌ **Processing Failed**\n\nNo confidence data available."

    get = confidence_data.get
    parts = [
        "📊 **Processing Statistics**\n\n",
        f"⏱️ **Processing Time**: {get('processing_time', 0):.2f}s\n",
        f"🌍 **Language**: {get('language_detected', 'Unknown')}\n",
        f"📦 **Regions Detected**: {get('total_regions', 0)}\n",
        f"⚙️ **Model Config**: {get('model_config', 'Unknown')}\n",
        f"🎯 **Task Type**: {get('task_type', 'Unknown')}\n",
    ]

    if 'image_size' in confidence_data:
        size = confidence_data['image_size']
        parts.append(f"🖼️ **Image Size**: {size[0]}×{size[1]}px\n")

    scores = confidence_data.get('confidence_scores')
    if scores:
        parts.append("\n📈 **Confidence Analysis**\n")
        parts.append(f"- Average: {np.mean(scores):.1%}\n")
        parts.append(f"- Range: {np.min(scores):.1%} - {np.max(scores):.1%}\n")

    if 'total_pages' in confidence_data:
        parts.append("\n📄 **PDF Statistics**\n")
        parts.append(f"- Total Pages: {get('total_pages', 0)}\n")
        parts.append(f"- Processed Pages: {get('processed_pages', 0)}\n")
        parts.append(f"- Average Time/Page: {get('average_processing_time', 0):.2f}s\n")

    return "".join(parts)
|
|
|
|
|
def create_custom_css():
    """
    Create custom CSS for enhanced UI styling.

    Returns the CSS string injected into the Gradio Blocks app (the
    ``css=`` argument where ``demo`` is constructed).  The class names
    defined here are referenced via ``elem_classes`` on the layout
    components (.main-header, .input-panel, .output-panel, .primary-button).
    """
    # NOTE: the string below is returned verbatim.  The header banner image
    # is loaded from a remote Cloudinary URL and blends over the gradient;
    # when unreachable, the gradient alone remains.
    return """
    .main-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        background-image: url('https://res.cloudinary.com/dsmgydskc/image/upload/v1761752081/bg_banner_ua46qt.png');
        background-size: cover;
        background-position: center;
        background-blend-mode: overlay;
        padding: 20px;
        border-radius: 10px;
        margin-bottom: 20px;
        box-shadow: 0 4px 15px rgba(0,0,0,0.2);
    }

    .input-panel {
        background: rgba(255, 255, 255, 0.05);
        backdrop-filter: blur(10px);
        border-radius: 15px;
        padding: 20px;
        margin-right: 10px;
        border: 1px solid rgba(255, 255, 255, 0.1);
    }

    .output-panel {
        background: rgba(255, 255, 255, 0.05);
        backdrop-filter: blur(10px);
        border-radius: 15px;
        padding: 20px;
        margin-left: 10px;
        border: 1px solid rgba(255, 255, 255, 0.1);
    }

    .primary-button {
        background: linear-gradient(45deg, #667eea, #764ba2) !important;
        border: none !important;
        color: white !important;
        font-weight: bold !important;
        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
        transition: all 0.3s ease !important;
    }

    .primary-button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6) !important;
    }

    .gradio-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        min-height: 100vh;
    }

    .gr-form {
        background: rgba(255, 255, 255, 0.1) !important;
        backdrop-filter: blur(10px) !important;
        border-radius: 15px !important;
        border: 1px solid rgba(255, 255, 255, 0.2) !important;
    }
    """
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI.  Built at import time; `demo` is launched from the __main__
# guard at the bottom of the file.  Layout: banner header, then a 1:2 split
# of input controls vs. tabbed result views, followed by a static help block.
# ---------------------------------------------------------------------------
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Multi-language & Handwritten Text Extraction Demo",
    css=create_custom_css()
) as demo:

    # Header banner (styled by .main-header in create_custom_css).
    with gr.Row(elem_classes="main-header"):
        gr.HTML("""
        <div style="text-align: center;">
            <h1 style="color: white; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); margin-bottom: 10px;">
                🌍 Multi-language & Handwritten Text Extraction Demo
            </h1>
            <p style="color: rgba(255,255,255,0.9); font-size: 1.1em; margin-bottom: 5px;">
                Powered by DeepSeek-OCR | Extract text from images and PDFs in multiple languages
            </p>
            <p style="color: rgba(255,255,255,0.8); font-size: 0.9em;">
                ✨ Support for handwritten text, tables, and 12+ languages with confidence scoring
            </p>
        </div>
        """)

    with gr.Row():
        # Left column: file input and processing settings.
        with gr.Column(scale=1, elem_classes="input-panel"):
            gr.Markdown("### 📤 Input Configuration")

            with gr.Tabs():
                with gr.Tab("📁 File Upload"):
                    file_in = gr.File(
                        label="Upload Image or PDF",
                        file_types=["image", ".pdf"],
                        type="filepath"
                    )

            gr.Markdown("### ⚙️ Processing Settings")

            # Dropdown choices come straight from the preset dicts above,
            # keeping the UI in sync with the supported options.
            language = gr.Dropdown(
                choices=list(SUPPORTED_LANGUAGES.keys()),
                value="🌍 Auto-Detect",
                label="Language"
            )

            mode = gr.Dropdown(
                choices=list(MODEL_CONFIGS.keys()),
                value="⚡ Gundam",
                label="Processing Mode"
            )

            task = gr.Dropdown(
                choices=list(TASK_PROMPTS.keys()),
                value="📋 Markdown",
                label="Task Type"
            )

            # Hidden custom-prompt box (see toggle_prompt).
            # NOTE(review): no change-event wiring from `task` to
            # toggle_prompt is visible in this file — confirm the textbox
            # is ever un-hidden.
            prompt = gr.Textbox(
                label="Custom Prompt",
                lines=3,
                visible=False,
                placeholder="Enter your custom prompt here..."
            )

            with gr.Row():
                btn = gr.Button(
                    "🚀 Extract Text",
                    variant="primary",
                    size="lg",
                    elem_classes="primary-button"
                )
                clear_btn = gr.Button(
                    "🗑️ Clear",
                    variant="secondary",
                    size="lg"
                )

        # Right column: result views, one tab per representation.
        with gr.Column(scale=2, elem_classes="output-panel"):
            gr.Markdown("### 📊 Results")

            with gr.Tabs():
                with gr.Tab("📝 Extracted Text"):
                    text_out = gr.Textbox(
                        lines=15,
                        show_copy_button=True,
                        show_label=False,
                        placeholder="Extracted text will appear here..."
                    )

                with gr.Tab("🎨 Markdown"):
                    md_out = gr.Markdown(
                        value="Markdown output will appear here...",
                        show_label=False
                    )

                with gr.Tab("🖼️ Annotated Image"):
                    img_out = gr.Image(
                        type="pil",
                        height=500,
                        show_label=False
                    )

                with gr.Tab("🖼️ Extracted Regions"):
                    gallery = gr.Gallery(
                        show_label=False,
                        columns=3,
                        height=400
                    )

                with gr.Tab("📊 Confidence & Stats"):
                    confidence_out = gr.Markdown(
                        value="Processing statistics will appear here...",
                        show_label=False
                    )

                with gr.Tab("🔍 Raw Output"):
                    raw_out = gr.Textbox(
                        lines=15,
                        show_copy_button=True,
                        show_label=False,
                        placeholder="Raw model output will appear here..."
                    )

    # Static help/about section shown below the two panels.
    gr.Markdown("""
    ## ℹ️ Information

    ### 🔧 Processing Modes
    - **Gundam**: 1024 base + 640 tiles with cropping - Best balance
    - **Tiny**: 512×512, no crop - Fastest
    - **Small**: 640×640, no crop - Quick
    - **Base**: 1024×1024, no crop - Standard
    - **Large**: 1280×1280, no crop - Highest quality

    ### 📋 Task Types
    - **Markdown**: Convert document to structured markdown (grounding ✅)
    - **Free OCR**: Simple text extraction
    - **Locate**: Find specific text in image (grounding ✅)
    - **Describe**: General image description
    - **Handwritten**: Specialized handwritten text extraction (grounding ✅)
    - **Table Extract**: Extract and format table data (grounding ✅)
    - **Custom**: Your own prompt (add `<|grounding|>` for boxes)

    ### 🌍 Language Support
    Supports 12+ languages including English, Arabic, Urdu, Chinese, Japanese, Korean, Spanish, French, German, Hindi, and Russian with automatic language detection.

    ### 💡 Tips
    - Use **Gundam mode** for best results
    - **Handwritten task** works best for handwritten documents
    - **Table Extract** automatically formats tables into markdown
    - Confidence scores show model certainty for each detected region
    - PDF processing supports up to 50 pages
    """)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_with_language(file_path, mode, task, prompt, language, webcam_image=None):
    """Gradio click handler: run OCR on the uploaded file and format outputs.

    Bug fix: ``webcam_image`` was previously the second positional
    parameter, but ``btn.click`` wires only five inputs
    (file, mode, task, prompt, language), so ``mode`` was silently passed
    as the webcam image and every later argument shifted by one.  The
    optional webcam source is now the last, defaulted parameter, so the
    existing five-input wiring maps correctly while a webcam component can
    still be added later.

    Returns a 6-tuple matching the click outputs:
    (text, markdown, raw, annotated_image, gallery, confidence_markdown).
    """
    try:
        # Prefer the uploaded file; fall back to a webcam capture if given.
        input_source = file_path if file_path else webcam_image
        if not input_source:
            return (
                "❌ Please upload a file or capture an image from webcam",
                "No input provided",
                "",
                None,
                [],
                "❌ **Error**: No input provided"
            )

        results = process_file(input_source, mode, task, prompt, language)

        text_result, md_result, raw_result, img_result, gallery_result, confidence_data = results

        # Convert the raw stats dict into display markdown.
        confidence_display = format_confidence_display(confidence_data)

        return text_result, md_result, raw_result, img_result, gallery_result, confidence_display

    except Exception as e:
        error_msg = f"❌ Processing failed: {str(e)}"
        return error_msg, error_msg, error_msg, None, [], error_msg
|
|
|
|
|
|
|
|
def clear_all():
    """Reset the upload widget and every output view.

    Bug fix: this handler previously returned 9 values while
    ``clear_btn.click`` wires only 7 output components, which makes Gradio
    raise a too-many-return-values error at runtime.  The tuple now matches
    the wired outputs one-to-one:
    [file_in, text_out, md_out, raw_out, img_out, gallery, confidence_out].
    """
    return (
        None,                      # file_in: clear the upload
        "",                        # text_out
        "Ready for new input...",  # md_out
        "",                        # raw_out
        None,                      # img_out
        [],                        # gallery
        "📊 **Ready**\n\nUpload an image or capture from webcam to begin processing."  # confidence_out
    )
|
|
|
|
|
|
|
|
# Wire the extract button: inputs are passed positionally to the handler,
# outputs map one-to-one onto the result tabs.
btn.click(
    process_with_language,
    inputs=[file_in, mode, task, prompt, language],
    outputs=[text_out, md_out, raw_out, img_out, gallery, confidence_out]
)

# Clear button resets the upload widget and every output view; the handler
# must return exactly one value per component listed here.
clear_btn.click(
    clear_all,
    outputs=[file_in, text_out, md_out, raw_out, img_out, gallery, confidence_out]
)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
|
|