Update app.py
app.py CHANGED
@@ -10,9 +10,9 @@ import spaces
 
 model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 
-processor =
+processor = LlavaOnevisionProcessor.from_pretrained(model_id)
 
-model =
+model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
 model.to("cuda")
 
 def sample_frames(video_file, num_frames):
@@ -58,7 +58,7 @@ def bot_streaming(message, history):
     if len(image) == 1:
         if image[0].endswith(video_extensions):
 
-            image = sample_frames(image[0],
+            image = sample_frames(image[0], 32)
             prompt = f"<|im_start|>user <video>\n{message.text}<|im_end|><|im_start|>assistant"
         elif image[0].endswith(image_extensions):
             image = Image.open(image[0]).convert("RGB")
@@ -105,12 +105,12 @@ def bot_streaming(message, history):
 
 
 demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Onevision", examples=[
-    {"text": "
-    {"text": "
-    {"text": "
+    {"text": "Do the cats in these two videos have same breed? What breed is each cat?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
+    {"text": "These are the tech specs of two laptops I am choosing from. Which one should I choose for office work?", "files":["./dell-tech-specs.jpeg", "./asus-tech-specs.png"]},
+    {"text": "Here are several images from a cooking book, showing how to prepare a meal step by step. Can you write a recipe for the meal, describing each step in details?", "files":["./step0.png", "./step1.png", "./step2.png", "./step3.png", "./step4.png", "./step5.png"]},
 
     {"text": "What is on the flower?", "files":["./bee.jpg"]},
-    {"text": "
+    {"text": "This is a video explaining how to create a Presentation in GoogleSlides. Can you write down what I should do step by step, following the video?", "files":["./tutorial.mp4"]}],
     textbox=gr.MultimodalTextbox(file_count="multiple"),
     description="Try [LLaVA Onevision](https://huggingface.co/docs/transformers/main/en/model_doc/llava_onevision) in this demo (more specifically, the [Qwen-2-0.5B-Instruct variant](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
     stop_btn="Stop Generation", multimodal=True)
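For reference, the first hunk fills in the standard Transformers loading pattern for this checkpoint. A minimal self-contained sketch of the block after this commit (the import lines sit outside the diff, so they are assumed here):

import torch
from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Load the processor and fp16 weights, then move the model to the GPU,
# mirroring the lines added in this commit.
processor = LlavaOnevisionProcessor.from_pretrained(model_id)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
model.to("cuda")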
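The second hunk pins the sampling call to 32 frames. The body of sample_frames is not part of this diff; the sketch below is a hypothetical implementation of uniform frame sampling with OpenCV that matches the signature used at the call site:

# Hypothetical sample_frames body; the diff only shows the call site.
import cv2
from PIL import Image

def sample_frames(video_file, num_frames):
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)  # keep roughly num_frames frames
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            continue
        if i % interval == 0:
            # OpenCV decodes to BGR; convert to RGB before handing frames to PIL
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames[:num_frames]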
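With these pieces in place, the sampled frames and the <video> prompt template from the diff feed into the processor and model.generate in the usual Transformers fashion. An illustrative end-to-end sketch, reusing the names defined above; the question and file name are examples, not part of the commit:

# Illustrative inference call against the loaded model.
frames = sample_frames("./tutorial.mp4", 32)
prompt = "<|im_start|>user <video>\nWhat should I do first in this tutorial?<|im_end|><|im_start|>assistant"

inputs = processor(text=prompt, videos=[frames], return_tensors="pt").to(model.device, torch.float16)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))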