# NOTE(review): removed non-code extraction residue (file-size banner, commit
# hashes, and a line-number gutter) that preceded the source and made the
# file syntactically invalid Python.
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces

# Initialize model and processor at import time (module-level singletons shared
# by every Gradio request).
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    # bfloat16 halves memory vs float32; assumes the target GPU supports
    # bf16 (Ampere or newer) — TODO confirm for the deployment hardware.
    torch_dtype=torch.bfloat16
).to("cuda")  # requires a CUDA device; no CPU fallback is provided
processor = AutoProcessor.from_pretrained(ckpt)

@spaces.GPU
def extract_text(image):
    """Transcribe all visible text from an image with the vision LLM.

    Args:
        image: Path to an image file (Gradio's ``type="filepath"`` input).

    Returns:
        The model's transcription as a plain string, with the chat prompt
        and special tokens removed.
    """
    # Normalize to 3-channel RGB so RGBA/grayscale/palette images all work.
    image = Image.open(image).convert("RGB")
    prompt = (
    "Output ONLY the raw text exactly as it appears in the image. Do not add anything.\n\n"
    "The image may contain both handwritten and printed text in French and/or English, including punctuation and underscores.\n\n"
    "Your task: Transcribe all visible text exactly, preserving:\n"
    "- All characters, accents, punctuation, spacing, and line breaks.\n"
    "- The original reading order and layout, including tables and forms if present.\n\n"
    "Rules:\n"
    "- Do NOT add any explanations, summaries, comments, or extra text.\n"
    "- Do NOT duplicate any content.\n"
    "- Do NOT indicate blank space.\n"
    "- Do NOT separate handwritten and printed text.\n"
    "- Do NOT confuse '.' (a period) with '|' (a border).\n\n"
    "Only extract the text that is actually visible in the image, and nothing else.")

    # Single-turn chat message pairing the instruction with the image slot.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"}
            ]
        }
    ]

    # Render the chat template and tokenize text + image together.
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=250)

    # Decode ONLY the newly generated tokens by slicing off the prompt
    # portion of the sequence. The previous string-based cleanup searched the
    # decoded text for "assistant" and then blanket-removed every occurrence
    # of "user", which silently corrupted any transcription that legitimately
    # contained those words (e.g. "username", "user manual").
    prompt_len = inputs["input_ids"].shape[-1]
    result = processor.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()

    return result

# Create Gradio interface wiring the OCR function to a simple image-in /
# text-out UI. NOTE: the variable MUST be named `demo` — Hugging Face Spaces
# looks for it by that name when auto-launching the app.
demo = gr.Interface(
    fn=extract_text,
    # "filepath" hands extract_text a path string, which it opens itself.
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)

# Launch the app; debug=True surfaces tracebacks in the UI/console.
demo.launch(debug=True)