Amy Roberts committed on
Commit 4707818
1 Parent(s): f2b92aa

Cache examples

Files changed (2)
  1. app.py +27 -13
  2. gradio_cached_examples/15/log.csv +4 -0
app.py CHANGED
@@ -76,7 +76,7 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
 
 
 def get_video_duration(filename):
-    cap = cv2.VideoCapture(filename)
+    cap = cv2.VideoCapture(_extract_video_filepath(filename))
     if cap.isOpened():
         rate = cap.get(5)
         frame_num = cap.get(7)
@@ -85,13 +85,21 @@ def get_video_duration(filename):
     return -1
 
 
+def _extract_video_filepath(video_filename):
+    if isinstance(video_filename, dict):
+        return video_filename['video']['path']
+    return video_filename
+
+
 def predict_durations(model_checkpoint, text, video_filename, device="cpu"):
     print(f"Loading model: {model_checkpoint}")
     model = TvpForVideoGrounding.from_pretrained(model_checkpoint)
     processor = AutoProcessor.from_pretrained(model_checkpoint)
     print(f"Loading video: {video_filename}")
+    filepath = video_filename['video']['path'] if isinstance(video_filename, dict) else video_filename
     raw_sampled_frames = decode(
-        container=av.open(video_filename, metadata_errors="ignore"),
+        container=av.open(_extract_video_filepath(video_filename), metadata_errors="ignore"),
+        # container=av.open(video_filename['path'], metadata_errors="ignore"),
         sampling_rate=1,
         num_frames=model.config.num_frames,
         clip_idx=0,
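
Note on the `_extract_video_filepath` helper added above: with `preprocess=False`, Gradio hands the function the example row's raw payload, so the video argument can arrive either as a plain path string (a live upload) or as a dict describing the file (a replayed cached example). A minimal sketch of the two shapes the helper normalizes; the dict keys follow the diff itself, while the extra `subtitles` entry is hypothetical padding:

def _extract_video_filepath(video_filename):
    # Same helper as in the diff: accept either a raw path or Gradio's
    # dict-shaped video payload and always return the path string.
    if isinstance(video_filename, dict):
        return video_filename['video']['path']
    return video_filename

# Plain string, as a live gr.Video upload would provide:
assert _extract_video_filepath("./examples/bed.mp4") == "./examples/bed.mp4"

# Dict payload, as an example replayed with preprocess=False might provide
# (the 'subtitles' entry is a hypothetical extra field):
payload = {"video": {"path": "./examples/bed.mp4"}, "subtitles": None}
assert _extract_video_filepath(payload) == "./examples/bed.mp4"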
@@ -114,15 +122,16 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
 MODELS = ["Intel/tvp-base", "Intel/tvp-base-ANet"]
 EXAMPLES = [
-    ["./examples/bed.mp4", "a person is sitting on a bed."],
-    ["./examples/food.mp4", "a person eats some food."],
-    ["./examples/book.mp4", "a person reads a book."],
+    ["Intel/tvp-base", "a person is sitting on a bed.", "./examples/bed.mp4", ],
+    ["Intel/tvp-base", "a person eats some food.", "./examples/food.mp4", ],
+    ["Intel/tvp-base", "a person reads a book.", "./examples/book.mp4", ],
 ]
 
 model_checkpoint = gr.Dropdown(MODELS, label="Model", value=MODELS[0], type="value")
 video_in = gr.Video(label="Video File", elem_id="video_in")
-text_in = gr.Textbox(label="Text", placeholder="Description of event in the video", interactive=True)
-text_out = gr.Textbox(label="Prediction", placeholder="Predicted start and end time")
+# text_in = gr.Textbox(label="Text", placeholder="Description of event in the video", interactive=True)
+# text_out = gr.Textbox(label="Prediction", placeholder="Predicted start and end time")
+# examples = gr.Examples(examples=EXAMPLES, fn=predict_durations, inputs=[model_checkpoint, text_in, video_in], outputs=[text_out], cache_examples=True, preprocess=False)
 
 
 title = "Video Grounding with TVP"
@@ -131,20 +140,25 @@ css = """.toast-wrap { display: none !important } """
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Row():
-        model_checkpoint.render()
-
-    with gr.Row():
-        examples = gr.Examples(examples=EXAMPLES, inputs=[video_in, text_in])
+        model_checkpoint = gr.Dropdown(MODELS, label="Model", value=MODELS[0], type="value")
+        # model_checkpoint.render()
 
     with gr.Row():
         with gr.Column():
             video_in.render()
 
         with gr.Column():
-            text_in.render()
+            text_in = gr.Textbox(label="Text", placeholder="Description of event in the video", interactive=True)
+            text_out = gr.Textbox(label="Prediction", placeholder="Predicted start and end time")
+            # text_in #.render()
             time_button = gr.Button("Get start and end time")
             time_button.click(predict_durations, inputs=[model_checkpoint, text_in, video_in], outputs=[text_out])
-            text_out.render()
+            # text_out #.render()
+
+    with gr.Row():
+        examples = gr.Examples(examples=EXAMPLES, fn=predict_durations, inputs=[model_checkpoint, text_in, video_in], outputs=[text_out], cache_examples=True, preprocess=False)
+        # examples.render()
+        # text_out.render()
 
 
 if __name__ == "__main__":
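
The net effect of the `gr.Examples` change: with `cache_examples=True`, Gradio runs `fn` on every example row once when the app is built and stores the outputs under `gradio_cached_examples/`, so clicking an example replays the stored prediction instead of re-running the model. A minimal, self-contained sketch of the same pattern, assuming current Gradio semantics, with a stand-in `echo_duration` in place of `predict_durations` and the example paths taken from the diff:

import gradio as gr

def echo_duration(model_name, text, video_path):
    # Stand-in for predict_durations; returns a canned prediction string.
    return f"start: 0.0s, end: 1.0s ({model_name}: {text!r} on {video_path})"

with gr.Blocks() as demo:
    model = gr.Dropdown(["Intel/tvp-base"], value="Intel/tvp-base", label="Model")
    video = gr.Video(label="Video File")
    text = gr.Textbox(label="Text")
    out = gr.Textbox(label="Prediction")
    gr.Examples(
        examples=[["Intel/tvp-base", "a person reads a book.", "./examples/book.mp4"]],
        fn=echo_duration,
        inputs=[model, text, video],
        outputs=[out],
        cache_examples=True,   # run fn at build time, replay cached outputs later
        preprocess=False,      # pass raw example payloads straight to fn
    )

if __name__ == "__main__":
    demo.launch()

Note that the example rows must match the order of `inputs` (model, text, video), which is why the commit also reorders `EXAMPLES` from `[video, text]` to `[model, text, video]`.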
 
gradio_cached_examples/15/log.csv ADDED
@@ -0,0 +1,4 @@
+Prediction,flag,username,timestamp
+"start: 0.0s, end: 6.8s",,,2023-11-22 15:49:43.930614
+"start: 0.0s, end: 11.4s",,,2023-11-22 15:50:00.348291
+"start: 0.0s, end: 5.6s",,,2023-11-22 15:50:16.641556