Spaces: Running on Zero
	Update app.py
    	
app.py CHANGED
@@ -13,8 +13,6 @@ import numpy as np
 from PIL import Image
 import cv2
 
-from keye_vl_utils import process_vision_info
-
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoModel,
@@ -24,9 +22,6 @@ from transformers import (
 )
 from transformers.image_utils import load_image
 
-import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -52,16 +47,6 @@ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load Keye-VL-8B-Preview
-MODEL_ID_K = "Kwai-Keye/Keye-VL-8B-Preview"
-processor_k = AutoModel.from_pretrained(MODEL_ID_K, trust_remote_code=True)
-model_k = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_K,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
@@ -99,9 +84,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2.5-VL-3B-Instruct":
         processor = processor_x
         model = model_x
-    elif model_name == "Keye-VL-8B-Preview":
-        processor = processor_k
-        model = model_k
     else:
         yield "Invalid model selected."
         return
@@ -152,9 +134,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2.5-VL-3B-Instruct":
         processor = processor_x
         model = model_x
-    elif model_name == "Keye-VL-8B-Preview":
-        processor = processor_k
-        model = model_k
     else:
         yield "Invalid model selected."
         return
@@ -253,7 +232,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
-                choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "Keye-VL-8B-Preview"],
+                choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct"],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
             )
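Taken together, the commit removes the Keye-VL-8B-Preview path entirely: the keye_vl_utils import, the startup pip install of flash-attn (run with FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE so no CUDA kernels are compiled at launch), the model and processor loading, both dispatch branches, and the radio-button choice. Notably, the deleted code built processor_k with AutoModel.from_pretrained, which returns a model rather than a processor, so that branch likely never worked; removing it also drops the bug. For context, here is a minimal sketch of the loading pattern the surviving Qwen2.5-VL models follow, assuming the usual AutoProcessor pairing; the hub ID below is an assumption, since the diff shows only the tail of the from_pretrained call for model_x:

# Minimal sketch of the surviving Qwen2.5-VL setup, assuming the usual
# AutoProcessor pairing. The hub ID below is an assumption; the diff
# shows only the tail of the from_pretrained(...) call for model_x.
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"  # assumed checkpoint
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_X,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()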
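The diff leaves downsample_video in place but shows only its signature and docstring. As a hedged sketch of what a function with that docstring typically does in these Gradio demos; the frame count, the (frame, timestamp) return shape, and the cv2/np helpers are assumptions, not code from this Space:

# Hedged sketch of downsample_video based only on its docstring; the
# frame count, (frame, timestamp) return shape, and cv2/np helpers are
# assumptions, not code from this Space.
import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path, num_frames=10):
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 1.0
    frames = []
    # Evenly spaced frame indices across the whole clip.
    for idx in np.linspace(0, total_frames - 1, num_frames, dtype=int):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        success, frame = vidcap.read()
        if not success:
            continue
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # BGR -> RGB for PIL
        frames.append((Image.fromarray(frame), round(idx / fps, 2)))
    vidcap.release()
    return frames

Fixing the sample count with linspace keeps the number of frames, and hence vision tokens, bounded regardless of clip length, which is the usual reason these demos downsample before calling the processor.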
