import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import re
from PIL import Image
import spaces  # Required for GPU allocation on Hugging Face Spaces
import os
from huggingface_hub import HfFolder

hf_token = os.getenv("API_KEY")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# If the key is found, use it to authenticate for this session
if hf_token:
    HfFolder.save_token(hf_token)
else:
    print("No API_KEY found. Please make sure you've set your Hugging Face API key as an environment variable.")

# Model information
MODEL_ID = "DeepMount00/Smol-OCR-preview"
# Italian instruction: "You are an expert OCR assistant, convert the text to MD format."
OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

# Load processor and model
processor = AutoProcessor.from_pretrained(MODEL_ID, token=hf_token)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)  # Loads on CUDA when a GPU is available (e.g. on Spaces)


@spaces.GPU  # spaces.GPU decorator enables GPU acceleration on Hugging Face Spaces
def process_image(image, progress=gr.Progress()):
    if image is None:
        raise gr.Error("Please upload an image to process.")

    progress(0, desc="Starting OCR processing...")

    # Convert from Gradio's image format to PIL if a file path was passed
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    progress(0.2, desc="Preparing image...")

    # Create input messages - the OCR instruction is included as part of the user message
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": OCR_INSTRUCTION}
            ]
        },
    ]

    # Prepare inputs
    progress(0.4, desc="Processing with model...")
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate outputs
    progress(0.6, desc="Generating text...")
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            temperature=0.1,
            do_sample=True
        )

    # Decode outputs
    progress(0.8, desc="Finalizing results...")
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0]

    # Extract only the assistant's response:
    # remove any "User:" and "Assistant:" prefixes if present
    cleaned_text = generated_text

    # Remove the user prompt and "User:" prefix if present
    user_pattern = r"User:.*?(?=Assistant:|$)"
    cleaned_text = re.sub(user_pattern, "", cleaned_text, flags=re.DOTALL)

    # Remove the "Assistant:" prefix if present
    assistant_pattern = r"Assistant:\s*"
    cleaned_text = re.sub(assistant_pattern, "", cleaned_text)

    # Clean up any extra whitespace
    cleaned_text = cleaned_text.strip()

    progress(1.0, desc="Done!")
    return cleaned_text  # Return only the cleaned text


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# OCR to Markdown Converter")
    gr.Markdown(
        "Upload Italian text images for instant Markdown conversion. "
        f"Powered by {MODEL_ID} for exceptional accuracy with Italian-language documents."
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload an image containing text")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Raw Text", lines=15)
            copy_btn = gr.Button("Select All Text", variant="secondary")

    submit_btn.click(
        fn=process_image,
        inputs=input_image,
        outputs=output_text,
        show_progress="full",
        queue=True  # Enable queueing for Spaces
    )

    def copy_to_clipboard(text):
        return text

    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=output_text,
        outputs=output_text
    )

# Launch the app with the default Spaces configuration (no local file paths needed)
demo.launch()