"""Gradio chat demo that streams answers from microsoft/Phi-3-vision-128k-instruct."""

import os
import subprocess
import time
from threading import Thread

# NOTE(review): relative order of the third-party imports is preserved from
# the original file.
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from transformers import TextIteratorStreamer

# Best-effort install of flash-attn at start-up (the return code is
# deliberately not checked, as in the original).  The flag is merged into the
# current environment: passing a bare dict as ``env`` would replace the whole
# environment and drop PATH, HOME, proxy settings, etc.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

PLACEHOLDER = """

microsoft/Phi-3-vision-128k-instruct

"""

# Chat-format markers.  They are not referenced by the code visible in this
# file, but are kept as module-level names for backward compatibility.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

NO_IMAGE_ERROR = "You need to upload an image for Phi-3-vision to work."

model_id = "microsoft/Phi-3-vision-128k-instruct"
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
model.to("cuda:0")


def _find_image_path(message, history):
    """Return the image to use for this turn, or ``None`` if there is none.

    The last file attached to ``message`` wins; each entry of
    ``message["files"]`` is either a dict carrying a ``"path"`` key or the
    path itself.  When the current turn has no files, the most recent history
    entry whose first element is a tuple supplies the image (its first item).
    """
    files = message["files"]
    if files:
        latest = files[-1]
        return latest["path"] if isinstance(latest, dict) else latest

    # Walk backwards so the most recent image in past turns is picked.
    for turn in reversed(history):
        if isinstance(turn[0], tuple):
            return turn[0][0]
    return None


def _strip_trailing_eos(prompt, eos="<|endoftext|>"):
    """Remove one trailing ``eos`` marker from ``prompt`` if present.

    ``str.rstrip`` is intentionally avoided: it strips a *set of characters*
    rather than a suffix and could eat legitimate trailing prompt characters.
    """
    if prompt.endswith(eos):
        return prompt[: -len(eos)]
    return prompt


@spaces.GPU
def bot_streaming(message, history):
    """Generate a streamed answer about an image.

    Args:
        message: Dict with a ``"text"`` entry (the user question) and a
            ``"files"`` entry (list of uploaded files for this turn).
        history: Previous chat turns; turns that carried a file hold a tuple
            as their first element.

    Yields:
        The text generated so far, growing with every streamed chunk.

    Raises:
        gr.Error: If neither the current turn nor the history has an image.
    """
    print(message)

    image_path = _find_image_path(message, history)
    if image_path is None:
        # gr.Error only takes effect when raised; merely constructing it
        # would let execution continue without an image.
        raise gr.Error(NO_IMAGE_ERROR)

    chat = [
        {"role": "user", "content": f"<|image_1|>\n{message['text']}"},
    ]
    prompt = processor.tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    # A trailing <|endoftext|> is used for training, not inference, so it is
    # removed when present.
    prompt = _strip_trailing_eos(prompt)
    print(f">>> Prompt\n{prompt}")

    # Keep the file open only while the processor reads the pixels.
    with Image.open(image_path) as image:
        inputs = processor(prompt, [image], return_tensors='pt').to("cuda:0")

    streamer = TextIteratorStreamer(
        processor, skip_special_tokens=False, skip_prompt=True
    )
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # Generation runs in a background thread; this generator consumes the
    # streamer and forwards the accumulated text.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    # Short pauses kept from the original (presumably to pace UI updates).
    time.sleep(0.5)
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.06)
        yield buffer


chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
chat_input = gr.MultimodalTextbox(
    interactive=True,
    file_types=["image"],
    placeholder="Enter message or upload file...",
    show_label=False,
)

with gr.Blocks(fill_height=True, ) as demo:
    gr.ChatInterface(
        fn=bot_streaming,
        title="Phi-3 Vision 128k Instruct",
        examples=[
            {"text": "What is on the flower?", "files": ["./bee.jpg"]},
            {"text": "How to make this pastry?", "files": ["./baklava.png"]},
        ],
        description="Try [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
        stop_btn="Stop Generation",
        multimodal=True,
        textbox=chat_input,
        chatbot=chatbot,
    )

demo.queue(api_open=False)
demo.launch(show_api=False, share=False)