| | import spaces |
| | from transformers import AutoProcessor, AutoModelForImageTextToText |
| | import torch |
| | import gradio as gr |
| |
|
| | device = "cuda" if torch.cuda.is_available() else "cpu" |
| |
|
| | MODEL_PATH = "zai-org/GLM-OCR" |
| | processor = AutoProcessor.from_pretrained(MODEL_PATH) |
| | model = AutoModelForImageTextToText.from_pretrained( |
| | pretrained_model_name_or_path=MODEL_PATH, |
| | torch_dtype="auto", |
| | device_map="auto", |
| | ).to(device) |
| |
|
| |
|
| |
|
@spaces.GPU
def read_img(img):
    """Run OCR on an image with the GLM OCR model.

    Args:
        img: path (or URL) to the input image file.

    Returns:
        output_text: the text recognized from the image, as a string.
    """
    # Single user turn: the image plus the OCR instruction prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # Some processors emit token_type_ids that model.generate() rejects.
    inputs.pop("token_type_ids", None)
    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Decode only the newly generated tokens (everything after the prompt).
    # skip_special_tokens=True so markers like end-of-sequence tokens do not
    # leak into the user-facing OCR text (the original passed False here).
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )

    return output_text
| |
|
# Gradio demo: image input and recognized-text output side by side, with
# submit/clear controls. Launched with MCP enabled so agents can call it.
with gr.Blocks() as imgsmiles:
    header = gr.Markdown(
        """
        # OCR with ZAI GLM
        """)

    # NOTE(review): this radio is not wired to any callback and is not
    # cleared by the ClearButton — confirm whether it is still needed.
    agent_flag_choice = gr.Radio(
        choices=['True', 'False'],
        label="Are you an Agent?",
        interactive=True,
        value='False',
        scale=2,
    )

    with gr.Row():
        image_input = gr.Image(type="filepath")
        ocr_output = gr.Textbox(lines=2, label="Text Output")

    run_button = gr.Button("Submit")
    reset_button = gr.ClearButton([image_input, ocr_output], value="Clear")

    run_button.click(read_img, [image_input], [ocr_output])

imgsmiles.launch(mcp_server=True)