ManishThota committed on
Commit
9f373ac
1 Parent(s): a408fb5

Create app.py

Files changed (1)
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
+ from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
+ import torch
+ import numpy as np
+ import av
+ import spaces
+ import gradio as gr
+ import os
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )
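+ # With bnb_4bit_quant_type left unset, bitsandbytes falls back to its default
+ # 4-bit format ("fp4"); only the compute dtype is pinned to float16 here.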
+
+ model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
+
+ processor = LlavaNextVideoProcessor.from_pretrained(model_name)
+ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+     model_name,
+     quantization_config=quantization_config,
+     device_map='auto'
+ )
+
+ @spaces.GPU
+ def read_video_pyav(container, indices):
+     '''
+     Decode selected frames of a video with the PyAV decoder.
+     Args:
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+     Returns:
+         np.ndarray: Decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ @spaces.GPU
+ def process_video(video_file, question_parts):
+     # Open the video and sample 8 evenly spaced frames
+     with av.open(video_file.name) as container:  # Gradio file inputs expose a .name path
+         total_frames = container.streams.video[0].frames
+         indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+         video_clip = read_video_pyav(container, indices)
+
+     # Combine the selected question parts into a single question
+     question = " ".join(question_parts)
+
+     # Prepare the conversation in chat-template format
+     conversation = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": f"{question}"},
+                 {"type": "video"},
+             ],
+         },
+     ]
+     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+     # Prepare inputs for the model
+     inputs = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
+
+     # Generate output with greedy decoding (top_p would only apply with do_sample=True)
+     generate_kwargs = {"max_new_tokens": 500, "do_sample": False}
+     output = model.generate(**inputs, **generate_kwargs)
+     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
+
+     return generated_text.split("ASSISTANT: ", 1)[-1].strip()
+
+ @spaces.GPU
+ def process_videos(video_files, question_parts):
+     """Process multiple videos and answer the combined question for each one."""
+     answers = []
+     for video_file in video_files:
+         video_name = os.path.basename(video_file.name)
+         answer = process_video(video_file, question_parts)
+         answers.append(f"**Video: {video_name}**\n{answer}\n")
+     return "\n---\n".join(answers)
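+ # The "**...**" and "---" markers above are Markdown; the gr.Textbox output
+ # defined below displays them literally (a gr.Markdown component would render them).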
+
+ # Define the Gradio interface for multiple videos
+ def gradio_interface(videos, indoors_outdoors, standing_sitting, hands_free, interacting_screen):
+     question_parts = []
+     if indoors_outdoors:
+         question_parts.append("Is the subject in the video present indoors or outdoors?")
+     if standing_sitting:
+         question_parts.append("Is the subject standing or sitting?")
+     if hands_free:
+         question_parts.append("Are the subject's hands free or not?")
+     if interacting_screen:
+         question_parts.append("Is the subject interacting with any screen in the background?")
+
+     answers = process_videos(videos, question_parts)
+     return answers
+
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.File(label="Upload Videos", file_count="multiple"),
+         gr.Checkbox(label="Indoors or Outdoors", value=False),
+         gr.Checkbox(label="Standing or Sitting", value=False),
+         gr.Checkbox(label="Hands Free or Not", value=False),
+         gr.Checkbox(label="Interacting with Screen", value=False),
+     ],
+     outputs=gr.Textbox(label="Generated Answers"),
+     title="Video Question Answering",
+     description="Upload multiple videos and select questions to get answers."
+ )
+
+ if __name__ == "__main__":
+     iface.launch(debug=True)
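
To sanity-check the frame sampling used in process_video without downloading the model or launching the Space, here is a minimal local sketch that mirrors the same logic; the file name sample.mp4 is hypothetical, and it assumes the video stream reports a frame count:

import av
import numpy as np

# Mirror process_video's sampling: 8 evenly spaced frame indices, then decode
# only those frames, as read_video_pyav does.
with av.open("sample.mp4") as container:  # hypothetical local clip
    total_frames = container.streams.video[0].frames
    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > indices[-1]:
            break
        if i in indices:
            frames.append(frame.to_ndarray(format="rgb24"))

clip = np.stack(frames)
print(clip.shape)  # expected: (8, height, width, 3)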