# Hugging Face Spaces app — "Running on Zero" (ZeroGPU) hardware.
# (Web-page banner, file-size line, and git-hash gutter from the original
# page scrape removed: they were not part of the Python source.)
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces
# Initialize model and processor.
# Checkpoint: a community mirror of Meta's Llama 3.2 11B Vision Instruct.
# NOTE: loading happens at import time — the Space downloads the weights and
# moves them to the GPU before the Gradio app starts serving.
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
ckpt,
torch_dtype=torch.bfloat16  # bf16 halves memory vs fp32; matches the published weights
).to("cuda")
# Processor bundles the tokenizer + image preprocessor for this checkpoint.
processor = AutoProcessor.from_pretrained(ckpt)
@spaces.GPU
def extract_text(image):
    """Transcribe all visible text from an image with Llama 3.2 Vision.

    Args:
        image: Filesystem path to the uploaded image (Gradio passes a path
            because the input component uses ``type="filepath"``).

    Returns:
        str: The model's transcription of the text in the image.
    """
    # Normalize to RGB so the processor always receives 3 channels
    # (handles PNGs with alpha, grayscale scans, palette images, ...).
    image = Image.open(image).convert("RGB")
    prompt = (
        "Output ONLY the raw text exactly as it appears in the image. Do not add anything.\n\n"
        "The image may contain both handwritten and printed text in French and/or English, including punctuation and underscores.\n\n"
        "Your task: Transcribe all visible text exactly, preserving:\n"
        "- All characters, accents, punctuation, spacing, and line breaks.\n"
        "- The original reading order and layout, including tables and forms if present.\n\n"
        "Rules:\n"
        "- Do NOT add any explanations, summaries, comments, or extra text.\n"
        "- Do NOT duplicate any content.\n"
        "- Do NOT indicate blank space.\n"
        "- Do NOT separate handwritten and printed text.\n"
        "- Do NOT confuse '.' (a period) with '|' (a border).\n\n"
        "Only extract the text that is actually visible in the image, and nothing else.")
    # Single-turn chat message: the text prompt plus one image placeholder
    # that the processor pairs with the PIL image below.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"}
            ]
        }
    ]
    # Render the chat template and tokenize text + image together.
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")
    # Generate the transcription.
    outputs = model.generate(**inputs, max_new_tokens=250)
    # BUG FIX: the previous version decoded the *entire* sequence (prompt +
    # reply) and then tried to strip the prompt with str.replace, which also
    # deleted every literal occurrence of "user" from the transcription itself
    # and relied on the raw prompt string surviving the chat template verbatim.
    # Decoding only the newly generated tokens yields exactly the model's
    # answer, with no lossy string surgery.
    prompt_len = inputs["input_ids"].shape[-1]
    generated = outputs[0][prompt_len:]
    result = processor.decode(generated, skip_special_tokens=True).strip()
    return result
# Create the Gradio interface: one image in, one text box out.
demo = gr.Interface(
    fn=extract_text,
    # type="filepath" means extract_text receives a path, not a numpy array.
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)
# Launch the app. debug=True surfaces worker tracebacks in the Space logs.
# FIX: removed a stray trailing "|" after this call, which was a syntax error
# left over from a copy/paste artifact.
demo.launch(debug=True)