"""app.py -- Gradio demo for Qari Arabic OCR (oddadmix/Qari-OCR-0.1-VL-2B-Instruct)."""
import os
import uuid

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
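
# Load the fine-tuned Qwen2-VL OCR model and its processor once at startup.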
model_name = "oddadmix/Qari-OCR-0.1-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda",
)
processor = AutoProcessor.from_pretrained(model_name)
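
# Upper bound on the number of tokens generated per page.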
max_tokens = 2000


@spaces.GPU
def perform_ocr(image):
    """Run OCR on a single page image and return the extracted plain text."""
    image = Image.fromarray(image)
    # qwen_vl_utils resolves images from URIs, so save the upload to a
    # uniquely named temporary PNG and reference it via file://.
    src = str(uuid.uuid4()) + ".png"
    image.save(src)
    # Fixed OCR instruction; it mirrors the prompt format the checkpoint
    # expects, even though no previously extracted text is supplied here.
    prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"file://{src}"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    try:
        # Build the chat-formatted prompt and gather the vision inputs.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda")
        generated_ids = model.generate(**inputs, max_new_tokens=max_tokens, use_cache=True)
        # Strip the prompt tokens so only newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
    finally:
        # Always remove the temporary image, even if generation fails.
        os.remove(src)
    print(output_text)
    return output_text
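

# Minimal Gradio UI: live=True re-runs OCR whenever the input image changes.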
iface = gr.Interface(
    fn=perform_ocr,
    inputs=gr.Image(type="numpy"),
    outputs=gr.Textbox(),
    live=True,
    title="Qari Arabic OCR",
    description="Upload an image to extract text in real-time.",
)
iface.launch()
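
# Hypothetical programmatic usage once this app is running (a sketch, assuming
# a recent gradio_client where file inputs are wrapped with handle_file; the
# URL and filename below are placeholders):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860/")
#   text = client.predict(handle_file("page.png"), api_name="/predict")
#   print(text)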