RaushanTurganbay committed daa724b (verified) · Parent: 02b3e9b

Update app.py

Files changed (1): app.py (+7 −7)
app.py CHANGED:

```diff
--- a/app.py
+++ b/app.py
@@ -10,9 +10,9 @@ import spaces
 
 model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 
-processor = LlavaProcessor.from_pretrained(model_id)
+processor = LlavaOnevisionProcessor.from_pretrained(model_id)
 
-model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
+model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
 model.to("cuda")
 
 def sample_frames(video_file, num_frames):
```
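The first hunk is the substantive fix: this checkpoint ships with OneVision-specific classes, so loading it through the generic `LlavaProcessor`/`LlavaForConditionalGeneration` mismatches the checkpoint's processor and architecture configuration. A minimal sketch of the corrected loading path, assuming a transformers release that includes the OneVision classes (v4.45 or later):

```python
# Minimal sketch of the corrected loading code (assumes transformers >= 4.45,
# where the LLaVA-OneVision classes were introduced).
import torch
from transformers import (
    LlavaOnevisionForConditionalGeneration,
    LlavaOnevisionProcessor,
)

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
processor = LlavaOnevisionProcessor.from_pretrained(model_id)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16
)
model.to("cuda")  # half precision on GPU, as in the app
```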
```diff
@@ -58,7 +58,7 @@ def bot_streaming(message, history):
     if len(image) == 1:
         if image[0].endswith(video_extensions):
 
-            image = sample_frames(image[0], 12)
+            image = sample_frames(image[0], 32)
             prompt = f"<|im_start|>user <video>\n{message.text}<|im_end|><|im_start|>assistant"
         elif image[0].endswith(image_extensions):
             image = Image.open(image[0]).convert("RGB")
```
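The second hunk raises the uniform frame sampling from 12 to 32 frames per video, giving the model more temporal context to reason over. The body of `sample_frames` is not part of this diff; below is a plausible sketch of such a helper using OpenCV and PIL (the name and signature come from the diff, the implementation is our assumption):

```python
# Hypothetical sketch of the sample_frames helper referenced in the diff;
# the app defines its own version, whose body is not shown here.
import cv2
from PIL import Image

def sample_frames(video_file, num_frames):
    """Uniformly sample up to `num_frames` frames from a video as PIL images."""
    video = cv2.VideoCapture(video_file)
    total = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(total // num_frames, 1)
    frames = []
    for i in range(total):
        ok, frame = video.read()
        if not ok:
            break
        if i % step == 0 and len(frames) < num_frames:
            # OpenCV decodes BGR; convert to the RGB order PIL expects.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames
```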
```diff
@@ -105,12 +105,12 @@ def bot_streaming(message, history):
 
 
 demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Onevision", examples=[
-    {"text": "The input contains two videos, are the cats in this video and this video doing the same thing?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
-    {"text": "There are two images in the input. What is the relationship between this image and this image?", "files":["./bee.jpg", "./depth-bee.png"]},
-    {"text": "What are these cats doing?", "files":["./cats.mp4"]},
+    {"text": "Do the cats in these two videos have same breed? What breed is each cat?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
+    {"text": "These are the tech specs of two laptops I am choosing from. Which one should I choose for office work?", "files":["./dell-tech-specs.jpeg", "./asus-tech-specs.png"]},
+    {"text": "Here are several images from a cooking book, showing how to prepare a meal step by step. Can you write a recipe for the meal, describing each step in details?", "files":["./step0.png", "./step1.png", "./step2.png", "./step3.png", "./step4.png", "./step5.png"]},
 
     {"text": "What is on the flower?", "files":["./bee.jpg"]},
-    {"text": "How to make this pastry?", "files":["./baklava.png"]}],
+    {"text": "This is a video explaining how to create a Presentation in GoogleSlides. Can you write down what I should do step by step, following the video?", "files":["./tutorial.mp4"]}],
     textbox=gr.MultimodalTextbox(file_count="multiple"),
     description="Try [LLaVA Onevision](https://huggingface.co/docs/transformers/main/en/model_doc/llava_onevision) in this demo (more specifically, the [Qwen-2-0.5B-Instruct variant](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
     stop_btn="Stop Generation", multimodal=True)
```
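The last hunk swaps the single-purpose example prompts for multi-file ones (two videos, paired spec images, a six-image step sequence), exercising the interleaved-media path. For context, the prompt template in the second hunk is the Qwen-2 chat format these checkpoints expect; a hedged sketch of how the renamed classes, the 32-frame sampler, and that template fit together in one non-streaming generation call (the exact processor kwargs are our assumption, not code from this app):

```python
# Sketch of one video turn through the new classes (assumes the model,
# processor, and sample_frames defined above; videos= taking a list of
# PIL frames as a single clip is our assumption about the processor API).
frames = sample_frames("./cats_1.mp4", 32)  # 32 frames, the new default
prompt = "<|im_start|>user <video>\nWhat are these cats doing?<|im_end|><|im_start|>assistant"

inputs = processor(videos=frames, text=prompt, return_tensors="pt").to("cuda", torch.float16)
output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```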
 