# Italian_OCR / app.py
# DeepMount00's picture
# Update app.py
# aeee287 verified
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
import re
from PIL import Image
import spaces # Add spaces import for Hugging Face Spaces
import os
import sys
import logging
from huggingface_hub import HfFolder
# Hugging Face access token, read from the "API_KEY" environment variable
# (None when the variable is not set).
hf_token = os.getenv("API_KEY")
# Device string derived from CUDA availability at import time.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# If the key is found, use it to authenticate
if hf_token:
    HfFolder.save_token(hf_token)  # This authenticates you for this session
else:
    # Name the variable that is actually read above, so the hint is actionable.
    print("No API_KEY found. Please make sure you've set up your Hugging Face API key as the API_KEY environment variable.")
# Hub repository used for both the processor and the model weights
MODEL_ID = "DeepMount00/Smol-OCR-preview"
# Italian prompt sent with every image; in English:
# "You are an expert OCR assistant, convert the text to MD format."
OCR_INSTRUCTION = "Sei un assistente esperto di OCR, converti il testo in formato MD."

# Load the processor and the model once, at import time
processor = AutoProcessor.from_pretrained(MODEL_ID, token=hf_token)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
)
# Ensure model loads on CUDA for Spaces
model = model.to("cuda")
def _extract_assistant_reply(decoded_text):
    """Return only the assistant's reply from a decoded prompt-plus-completion string.

    The decoded text is expected to contain the chat prompt followed by an
    "Assistant:" marker and then the generated reply. Only the first marker
    is treated as the boundary, so any "User:" or "Assistant:" text that
    appears inside the recognised document itself is preserved.

    Args:
        decoded_text: Full decoded model output.

    Returns:
        The text after the first "Assistant:" marker, stripped of surrounding
        whitespace. If the marker is absent, the whole input stripped.
    """
    _prompt, marker, reply = decoded_text.partition("Assistant:")
    if not marker:
        # No chat marker to cut at: return the text unchanged apart from trimming.
        return decoded_text.strip()
    return reply.strip()


@spaces.GPU  # Add spaces.GPU decorator for GPU acceleration
def process_image(image, progress=gr.Progress()):
    """Run the OCR model on one image and return the recognised text.

    Args:
        image: The image to transcribe. A string is treated as a file path
            and opened as an RGB PIL image; any other non-None value is
            passed to the processor as-is.
        progress: Gradio progress tracker used to report the pipeline stage.

    Returns:
        The model's reply as a string, with the echoed chat prompt removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # gr.Error only reaches the user when it is raised; merely
        # constructing it (as before) had no visible effect.
        raise gr.Error("Please upload an image to process.")

    progress(0, desc="Starting OCR processing...")

    # Accept a file path as well as an already-loaded image
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")

    progress(0.2, desc="Preparing image...")

    # Single user turn: an image placeholder followed by the OCR instruction
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": OCR_INSTRUCTION},
            ],
        },
    ]

    # Prepare inputs
    progress(0.4, desc="Processing with model...")
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = inputs.to('cuda')

    # Generate outputs
    progress(0.6, desc="Generating text...")
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=4096,
            temperature=0.1,
            do_sample=True,
        )

    # Decode outputs
    progress(0.8, desc="Finalizing results...")
    generated_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0]

    # Keep only what follows the first "Assistant:" marker. The previous
    # regex clean-up removed every "User:"/"Assistant:" occurrence, which
    # could delete parts of the recognised document.
    cleaned_text = _extract_assistant_reply(generated_text)

    progress(1.0, desc="Done!")
    return cleaned_text  # Return only the cleaned text
# Create Gradio interface: image input on the left, recognised text on the right
with gr.Blocks() as demo:
    gr.Markdown("# OCR to Markdown Converter")
    # Fixed the missing space after "conversion." in the user-facing blurb.
    gr.Markdown(f"Upload Italian text images for instant Markdown conversion. Powered by {MODEL_ID} technology for exceptional accuracy with Italian language documents.")

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload an image containing text")
            submit_btn = gr.Button("Process Image", variant="primary")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Raw Text", lines=15)
            copy_btn = gr.Button("Select All Text", variant="secondary")

    # Run OCR on the uploaded image and show the result in the textbox
    submit_btn.click(
        fn=process_image,
        inputs=input_image,
        outputs=output_text,
        show_progress="full",
        queue=True  # Enable queue for Spaces
    )

    def copy_to_clipboard(text):
        """Return *text* unchanged.

        NOTE(review): despite the name and the "Select All Text" button
        label, this handler only writes the textbox value back to itself;
        nothing in this code selects text or touches the clipboard.
        """
        return text

    copy_btn.click(
        fn=copy_to_clipboard,
        inputs=output_text,
        outputs=output_text
    )

# Launch the app with default Spaces configuration (no need for local file paths)
demo.launch()