| | import spaces |
| | from transformers import AutoProcessor, AutoModelForImageTextToText |
| | import torch |
| | import gradio as gr |
| |
|
| | device = "cuda" if torch.cuda.is_available() else "cpu" |
| |
|
| | MODEL_PATH = "zai-org/GLM-OCR" |
| | processor = AutoProcessor.from_pretrained(MODEL_PATH) |
| | model = AutoModelForImageTextToText.from_pretrained( |
| | pretrained_model_name_or_path=MODEL_PATH, |
| | torch_dtype="auto", |
| | device_map="auto", |
| | ).to(device) |
| |
|
| |
|
| |
|
@spaces.GPU
def read_img(img):
    """Run OCR on an image with the GLM OCR model.

    Args:
        img: path (or URL) to the input image file.

    Returns:
        output_text: the text recognized from the image, as a string.
    """
    # Single user turn: the image plus the OCR instruction prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # Some processors emit token_type_ids that model.generate() rejects.
    inputs.pop("token_type_ids", None)
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Decode only the newly generated tokens (everything after the prompt).
    # skip_special_tokens=True so markers like end-of-sequence tokens do not
    # leak into the user-facing OCR text (the original passed False here).
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )

    return output_text
| |
|
# Gradio demo: image input and recognized-text output side by side, with
# submit/clear controls. Launched with MCP enabled so agents can call it.
with gr.Blocks() as imgsmiles:
    header = gr.Markdown(
        """
        # OCR with ZAI GLM
        """)

    # NOTE(review): this radio is not wired to any callback and is not
    # cleared by the ClearButton — confirm whether it is still needed.
    agent_flag_choice = gr.Radio(
        choices=['True', 'False'],
        label="Are you an Agent?",
        interactive=True,
        value='False',
        scale=2,
    )

    with gr.Row():
        image_input = gr.Image(type="filepath")
        ocr_output = gr.Textbox(lines=2, label="Text Output")

    run_button = gr.Button("Submit")
    reset_button = gr.ClearButton([image_input, ocr_output], value="Clear")

    run_button.click(read_img, [image_input], [ocr_output])

imgsmiles.launch(mcp_server=True)