Update app.py
app.py CHANGED
@@ -10,9 +10,9 @@ import spaces
 
 model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 
-processor =
+processor = LlavaOnevisionProcessor.from_pretrained(model_id)
 
-model =
+model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
 model.to("cuda")
 
 def sample_frames(video_file, num_frames):
@@ -58,7 +58,7 @@ def bot_streaming(message, history):
     if len(image) == 1:
         if image[0].endswith(video_extensions):
 
-            image = sample_frames(image[0],
+            image = sample_frames(image[0], 32)
             prompt = f"<|im_start|>user <video>\n{message.text}<|im_end|><|im_start|>assistant"
         elif image[0].endswith(image_extensions):
             image = Image.open(image[0]).convert("RGB")
@@ -105,12 +105,12 @@ def bot_streaming(message, history):
 
 
 demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Onevision", examples=[
-    {"text": "
-    {"text": "
-    {"text": "
+    {"text": "Do the cats in these two videos have same breed? What breed is each cat?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
+    {"text": "These are the tech specs of two laptops I am choosing from. Which one should I choose for office work?", "files":["./dell-tech-specs.jpeg", "./asus-tech-specs.png"]},
+    {"text": "Here are several images from a cooking book, showing how to prepare a meal step by step. Can you write a recipe for the meal, describing each step in details?", "files":["./step0.png", "./step1.png", "./step2.png", "./step3.png", "./step4.png", "./step5.png"]},
 
     {"text": "What is on the flower?", "files":["./bee.jpg"]},
-    {"text": "
+    {"text": "This is a video explaining how to create a Presentation in GoogleSlides. Can you write down what I should do step by step, following the video?", "files":["./tutorial.mp4"]}],
     textbox=gr.MultimodalTextbox(file_count="multiple"),
     description="Try [LLaVA Onevision](https://huggingface.co/docs/transformers/main/en/model_doc/llava_onevision) in this demo (more specifically, the [Qwen-2-0.5B-Instruct variant](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
     stop_btn="Stop Generation", multimodal=True)
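For reference, the first hunk fills in the standard Transformers loading pattern for this checkpoint. A minimal self-contained sketch of the block after this commit (the import lines sit outside the diff, so they are assumed here):

import torch
from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Load the processor and fp16 weights, then move the model to the GPU,
# mirroring the lines added in this commit.
processor = LlavaOnevisionProcessor.from_pretrained(model_id)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
model.to("cuda")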
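The second hunk pins the sampling call to 32 frames. The body of sample_frames is not part of this diff; the sketch below is a hypothetical implementation of uniform frame sampling with OpenCV that matches the signature used at the call site:

# Hypothetical sample_frames body; the diff only shows the call site.
import cv2
from PIL import Image

def sample_frames(video_file, num_frames):
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)  # keep roughly num_frames frames
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            continue
        if i % interval == 0:
            # OpenCV decodes to BGR; convert to RGB before handing frames to PIL
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames[:num_frames]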
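With these pieces in place, the sampled frames and the <video> prompt template from the diff feed into the processor and model.generate in the usual Transformers fashion. An illustrative end-to-end sketch, reusing the names defined above; the question and file name are examples, not part of the commit:

# Illustrative inference call against the loaded model.
frames = sample_frames("./tutorial.mp4", 32)
prompt = "<|im_start|>user <video>\nWhat should I do first in this tutorial?<|im_end|><|im_start|>assistant"

inputs = processor(text=prompt, videos=[frames], return_tensors="pt").to(model.device, torch.float16)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))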