# Hugging Face Space file for "datamoon/qwen2-vl-iranian-idcard-ocr" (revision 13b328e).
import json

import gradio as gr
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

from util.vision_util import process_vision_info
# Fine-tuned Qwen2-VL checkpoint for Iranian national ID card OCR.
_CHECKPOINT = "datamoon/qwen2-vl-iranian-idcard-ocr"

# device_map="auto" lets the weights be placed on GPU/CPU automatically.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    device_map="auto",
)
# Left padding — the convention for decoder-only generation.
processor = AutoProcessor.from_pretrained(
    _CHECKPOINT,
    padding_side="left",
)
def process_id_card(image_path):
    """Extract structured fields from an Iranian national ID card image.

    Runs the fine-tuned Qwen2-VL model on the supplied image, instructs it to
    answer with a fixed-schema JSON object, then parses and validates that
    object.

    Args:
        image_path: Filesystem path to the card image (Gradio supplies this
            because the input component uses ``type="filepath"``).

    Returns:
        dict: On success, the six extracted fields. On failure, a dict with an
        ``"error"`` message (plus ``"raw_output"`` when the model answered but
        the answer was not the expected JSON).
    """
    try:
        # Chat-style message pairing the image with the extraction prompt.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": """From this image which is a persian national id card,
return a JSON object with these exact fields:
{
"national_id": "...",
"first_name": "...",
"last_name": "...",
"date_of_birth": "...",
"father_name": "...",
"expiry_date": "..."
}
Return ONLY the JSON object, nothing else."""}
            ]
        }]
        # Resolve the image reference into model-ready vision inputs.
        image_inputs, _ = process_vision_info(messages)
        # Render the chat template and append the assistant prompt marker.
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = processor(
            text=text,
            images=image_inputs,
            return_tensors="pt",
        ).to(model.device)
        # inference_mode() skips autograd bookkeeping during generation,
        # reducing memory use; do_sample=False keeps output deterministic.
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
            )
        # Decode only the newly generated tokens (drop the echoed prompt).
        generated_text = processor.batch_decode(
            generated_ids[:, inputs.input_ids.shape[1]:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        # Locate the JSON object inside the (possibly chatty) model reply.
        json_start = generated_text.find('{')
        json_end = generated_text.rfind('}') + 1
        # Fix: a reply with no braces used to yield a garbage slice
        # (find() == -1); report the condition explicitly instead.
        if json_start == -1 or json_end <= json_start:
            return {
                "error": "Could not parse model output: no JSON object found",
                "raw_output": generated_text,
            }
        try:
            result = json.loads(generated_text[json_start:json_end])
            required_fields = [
                "national_id", "first_name", "last_name",
                "date_of_birth", "father_name", "expiry_date",
            ]
            for field in required_fields:
                if field not in result:
                    raise ValueError(f"Missing field: {field}")
            return result
        except (json.JSONDecodeError, ValueError) as e:
            return {"error": f"Could not parse model output: {str(e)}", "raw_output": generated_text}
    except Exception as e:
        # Top-level guard so the UI always receives a readable error dict.
        return {"error": str(e)}
# Create Gradio interface: single image in, extracted-fields JSON out.
iface = gr.Interface(
    fn=process_id_card,
    # type="filepath" hands process_id_card a path string, not a PIL image.
    inputs=gr.Image(type="filepath", label="Upload ID Card Image"),
    outputs=gr.JSON(label="Extracted Information"),
    title="Persian ID Card Reader",
    description="""Upload an image of an Iranian national ID card to extract information.
The system will return: national_id, first_name, last_name, date_of_birth, father_name, and expiry_date.""",
    # NOTE(review): example file must exist relative to the working directory.
    examples=[
        ["examples/id1.png"]
    ],
    allow_flagging="never"
)
# Launch the app; 0.0.0.0 binds all interfaces so the Space/container
# is reachable from outside, with no public share link and no debug output.
iface.launch(
    server_name="0.0.0.0",
    share=False,
    debug=False
)