Julien Simon committed
Commit 21828aa
1 Parent(s): aa315eb

Initial version

Files changed (3)
  1. app.py +90 -0
  2. requirements.txt +4 -0
  3. video.mp4 +3 -0
app.py ADDED
@@ -0,0 +1,90 @@
+import cv2
+import gradio as gr
+from PIL import Image
+from transformers import (BridgeTowerForImageAndTextRetrieval,
+                          BridgeTowerProcessor)
+
+model_id = "BridgeTower/bridgetower-large-itm-mlm"
+processor = BridgeTowerProcessor.from_pretrained(model_id)
+model = BridgeTowerForImageAndTextRetrieval.from_pretrained(model_id)
+
+
+# Score a single frame against one or more comma-separated text queries
+def process_frame(image, texts):
+    scores = {}
+    texts = texts.split(",")
+    for t in texts:
+        encoding = processor(image, t, return_tensors="pt")
+        outputs = model(**encoding)
+        # logits[0, 1] is the image-text matching score for this pair
+        scores[t] = "{:.2f}".format(outputs.logits[0, 1].item())
+    # sort scores in descending order (compare as numbers, not strings)
+    scores = dict(sorted(scores.items(), key=lambda item: float(item[1]), reverse=True))
+    return scores
+
+
+# Process a video: sample one frame every 'sample_rate' seconds, score it,
+# and group consecutive matching frames into (start, end) clips
+def process(video, text, sample_rate, min_score):
+    video = cv2.VideoCapture(video)
+    fps = round(video.get(cv2.CAP_PROP_FPS))
+    frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    length = frames // fps
+    print(f"{fps} fps, {frames} frames, {length} seconds")
+
+    frame_count = 0
+    clips = []
+    clip_images = []
+    clip_started = False
+    while True:
+        ret, frame = video.read()
+        if not ret:
+            break
+
+        if frame_count % (fps * sample_rate) == 0:
+            # OpenCV decodes frames as BGR; convert to RGB before scoring
+            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            scores = process_frame(frame, text)
+            # print(f"{frame_count} {scores}")
+
+            if float(scores[text]) > min_score:
+                if clip_started:
+                    end_time = frame_count / fps
+                else:
+                    clip_started = True
+                    start_time = frame_count / fps
+                    end_time = start_time
+                clip_images.append(frame)
+            elif clip_started:
+                clip_started = False
+                end_time = frame_count / fps
+                clips.append((start_time, end_time))
+        frame_count += 1
+    # close the last clip if the video ends while a match is in progress
+    if clip_started:
+        clips.append((start_time, end_time))
+    return clip_images, clips
+
+
+# Inputs
+video = gr.Video(label="Video")
+text = gr.Text(label="Text query")
+sample_rate = gr.Number(default=2, label="Sample rate (1 frame every 'n' seconds)")
+min_score = gr.Number(default=3, label="Minimum score")
+
+# Outputs
+gallery = gr.Gallery(label="Images")
+clips = gr.Text(label="Clips")
+
+description = "This Space lets you run semantic search on a video."
+
+iface = gr.Interface(
+    description=description,
+    fn=process,
+    inputs=[video, text, sample_rate, min_score],
+    outputs=[gallery, clips],
+    examples=[
+        [
+            "video.mp4",
+            "wild bears",
+            2,
+            3,
+        ]
+    ],
+    allow_flagging="never",
+)
+
+iface.launch()
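
For reference, here is a minimal standalone sketch of the image-text matching call that app.py builds on. It uses the same model and processor as the commit; the image file name and query below are placeholder values for illustration, not part of the Space.

from PIL import Image
from transformers import (BridgeTowerForImageAndTextRetrieval,
                          BridgeTowerProcessor)

model_id = "BridgeTower/bridgetower-large-itm-mlm"
processor = BridgeTowerProcessor.from_pretrained(model_id)
model = BridgeTowerForImageAndTextRetrieval.from_pretrained(model_id)

# "frame.jpg" and the query are placeholders for illustration
image = Image.open("frame.jpg")
encoding = processor(image, "wild bears", return_tensors="pt")
outputs = model(**encoding)

# The retrieval head returns two logits per image-text pair;
# index 1 is the matching score that app.py compares to "Minimum score"
print(outputs.logits[0, 1].item())

Higher values indicate a closer match; frames scoring above the user-supplied minimum (3 in the example) are kept and grouped into clips.
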
requirements.txt ADDED
@@ -0,0 +1,4 @@
+git+https://github.com/huggingface/transformers
+torch
+Pillow
+opencv-python
video.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a77bb5024f5028f7b95c5a7b6fdf1a6bb2ee787a250f5cd2a12ee96bca970f4
+size 11623402