import gradio as gr
import spaces
import torch
import os
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# os.system("pip install git+https://github.com/huggingface/transformers")

model_id = "microsoft/Phi-3-vision-128k-instruct"

# Load the model on GPU; _attn_implementation='eager' disables flash attention.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    _attn_implementation="eager",
).cuda()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


@spaces.GPU
def infer(url, text):
    # Fall back to a default image and prompt when the inputs are empty.
    if len(url) < 1:
        url = "https://lf3-static.bytednsdoc.com/obj/eden-cn/pbovhozuha/output.png"
    if len(text) < 1:
        text = "Convert the text in the image to markdown"

    # The <|image_1|> placeholder marks where the processor inserts the image.
    messages = [
        {"role": "user", "content": "<|image_1|>\n" + text},
    ]

    image = Image.open(requests.get(url, stream=True).raw)

    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

    generation_args = {
        "max_new_tokens": 512,
        "temperature": 0.7,
        "do_sample": True,
    }
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args,
    )

    # Remove the input tokens so only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response


demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Text(label="url"),
        gr.Text(label="text"),
    ],
    outputs=gr.Text(),
)
demo.launch()