Tonic committed
Commit b297085
1 Parent(s): 4fa7fae

add gradio interface

Files changed (2)
  1. README.md +1 -1
  2. app.py +30 -8
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Llava Video
-emoji: 🌖
+emoji: 🌋📹
 colorFrom: purple
 colorTo: green
 sdk: gradio
app.py CHANGED
@@ -25,7 +25,26 @@ import tempfile
 import os
 import shutil
 #warnings.filterwarnings("ignore")
-
+title = "# 🙋🏻‍♂️Welcome to 🌟Tonic's 🌋📹LLaVA-Video!"
+description1 = """The **🌋📹LLaVA-Video-7B-Qwen2** is a 7B parameter model trained on the 🌋📹LLaVA-Video-178K dataset and the LLaVA-OneVision dataset. It is [based on the **Qwen2 language model**](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f), supporting a context window of up to 32K tokens. The model can process and interact with images, multi-images, and videos, with specific optimizations for video analysis.
+This model leverages the **SO400M vision backbone** for visual input and Qwen2 for language processing, making it highly efficient in multi-modal reasoning, including visual and video-based tasks.
+🌋📹LLaVA-Video has larger variants of [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) and [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2), and a [variant](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) trained only on the new synthetic data.
+For further details, please visit the [Project Page](https://github.com/LLaVA-VL/LLaVA-NeXT) or check out the corresponding [research paper](https://arxiv.org/abs/2410.02713).
+"""
+description2 = """- **Architecture**: `LlavaQwenForCausalLM`
+- **Attention Heads**: 28
+- **Hidden Layers**: 28
+- **Hidden Size**: 3584
+- **Intermediate Size**: 18944
+- **Max Frames Supported**: 64
+- **Languages Supported**: English, Chinese
+- **Image Aspect Ratio**: `anyres_max_9`
+- **Image Resolution**: Various grid resolutions
+- **Max Position Embeddings**: 32,768
+- **Vocab Size**: 152,064
+- **Model Precision**: bfloat16
+- **Hardware Used for Training**: 256 * Nvidia Tesla A100 GPUs
+"""
 def load_video(video_path, max_frames_num, fps=1, force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
@@ -94,14 +113,17 @@ def gradio_interface(video_file, question):
     return response
 
 with gr.Blocks() as demo:
-    gr.Markdown("# LLaVA-Video-7B-Qwen2 Demo")
-    gr.Markdown("Upload a video and ask a question about it.")
-
+    gr.Markdown(title)
     with gr.Row():
-        video_input = gr.Video()
-        question_input = gr.Textbox(label="Question", placeholder="Ask a question about the video...")
-
-    submit_button = gr.Button("Submit")
+        with gr.Group():
+            gr.Markdown(description1)
+        with gr.Group():
+            gr.Markdown(description2)
+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video()
+            question_input = gr.Textbox(label="Question", placeholder="Ask a question about the video...")
+            submit_button = gr.Button("Submit")
     output = gr.Textbox(label="Response")
 
     submit_button.click(
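
The second app.py hunk ends right at `submit_button.click(`, so the event wiring is truncated in this view. Below is a minimal, self-contained sketch of the Blocks layout this commit introduces, with a stubbed handler standing in for the real model call; the `stub_gradio_interface` function and the `fn`/`inputs`/`outputs` arguments are illustrative assumptions, not the Space's actual code.

```python
# Sketch of the layout added in this commit. The click() wiring and the
# stub handler are assumptions for illustration; the real app runs the
# LLaVA-Video model inside gradio_interface().
import gradio as gr

title = "# 🙋🏻‍♂️Welcome to 🌟Tonic's 🌋📹LLaVA-Video!"
description1 = "Model overview (see the full text in the diff above)."
description2 = "Model spec list (see the full text in the diff above)."

def stub_gradio_interface(video_file, question):
    # Placeholder for the real inference call on (video_file, question).
    return f"Received question {question!r} for video: {video_file}"

with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Group():
            gr.Markdown(description1)
        with gr.Group():
            gr.Markdown(description2)
    with gr.Row():
        with gr.Column():
            video_input = gr.Video()
            question_input = gr.Textbox(
                label="Question",
                placeholder="Ask a question about the video...",
            )
            submit_button = gr.Button("Submit")
    output = gr.Textbox(label="Response")

    submit_button.click(
        fn=stub_gradio_interface,  # assumed wiring; the diff truncates here
        inputs=[video_input, question_input],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
```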
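The `description2` string in the first hunk lists architecture details (attention heads, hidden size, vocab size, and so on) that mirror fields of the model's `config.json`. As a quick cross-check, here is a hedged sketch that downloads the config from the Hub and prints those fields; the repo id `lmms-lab/LLaVA-Video-7B-Qwen2` and the exact key names are assumptions based on standard Qwen2-style configs, not values confirmed by this diff.

```python
# Fetch the model's config.json and print the fields that description2
# reports. The repo id and key names are assumptions (typical Qwen2-style
# config keys); .get() is used so any missing key simply prints None.
import json

from huggingface_hub import hf_hub_download

config_path = hf_hub_download(
    repo_id="lmms-lab/LLaVA-Video-7B-Qwen2",  # assumed Hub id for the 7B model
    filename="config.json",
)
with open(config_path) as f:
    cfg = json.load(f)

for key in (
    "architectures",
    "num_attention_heads",
    "num_hidden_layers",
    "hidden_size",
    "intermediate_size",
    "image_aspect_ratio",
    "max_position_embeddings",
    "vocab_size",
    "torch_dtype",
):
    print(f"{key}: {cfg.get(key)}")
```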