Spaces: Runtime error
Julien Simon committed · Commit 21828aa
1 Parent(s): aa315eb
Initial version
- app.py +90 -0
- requirements.txt +4 -0
- video.mp4 +3 -0
app.py
ADDED
@@ -0,0 +1,90 @@
import cv2
import gradio as gr
from PIL import Image
from transformers import (BridgeTowerForImageAndTextRetrieval,
                          BridgeTowerProcessor)

model_id = "BridgeTower/bridgetower-large-itm-mlm"
processor = BridgeTowerProcessor.from_pretrained(model_id)
model = BridgeTowerForImageAndTextRetrieval.from_pretrained(model_id)


# Score a frame against each comma-separated text query
def process_frame(image, texts):
    scores = {}
    texts = texts.split(",")
    for t in texts:
        encoding = processor(image, t, return_tensors="pt")
        outputs = model(**encoding)
        # logits[0, 1] is the image-text matching score
        scores[t] = "{:.2f}".format(outputs.logits[0, 1].item())
    # sort scores in descending order (compare as numbers, not strings)
    scores = dict(sorted(scores.items(), key=lambda item: float(item[1]), reverse=True))
    return scores


# Process a video: sample frames, score them against the query,
# and group consecutive matching frames into clips
def process(video, text, sample_rate, min_score):
    video = cv2.VideoCapture(video)
    fps = round(video.get(cv2.CAP_PROP_FPS))
    frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    length = frames // fps
    print(f"{fps} fps, {frames} frames, {length} seconds")

    frame_count = 0
    clips = []
    clip_images = []
    clip_started = False
    while True:
        ret, frame = video.read()
        if not ret:
            break

        # Sample one frame every 'sample_rate' seconds
        if frame_count % (fps * sample_rate) == 0:
            # OpenCV decodes frames as BGR; convert to RGB for PIL
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # 'text' is expected to be a single query here
            scores = process_frame(frame, text)
            # print(f"{frame_count} {scores}")

            if float(scores[text]) > min_score:
                if clip_started:
                    end_time = frame_count / fps
                else:
                    clip_started = True
                    start_time = frame_count / fps
                    end_time = start_time
                clip_images.append(frame)
            elif clip_started:
                clip_started = False
                end_time = frame_count / fps
                clips.append((start_time, end_time))
        frame_count += 1
    # Close a clip that is still open when the video ends
    if clip_started:
        clips.append((start_time, end_time))
    return clip_images, clips


# Inputs
video = gr.Video(label="Video")
text = gr.Text(label="Text query")
sample_rate = gr.Number(value=2, label="Sample rate (1 frame every 'n' seconds)")
min_score = gr.Number(value=3, label="Minimum score")

# Outputs
gallery = gr.Gallery(label="Images")
clips = gr.Text(label="Clips")

description = "This Space lets you run semantic search on a video."

iface = gr.Interface(
    description=description,
    fn=process,
    inputs=[video, text, sample_rate, min_score],
    outputs=[gallery, clips],
    examples=[
        [
            "video.mp4",
            "wild bears",
            2,
            3,
        ]
    ],
    allow_flagging="never",
)

iface.launch()
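For a quick sanity check outside the Gradio UI, here is a minimal sketch of the same image-text matching step used in process_frame above; frame.jpg and the query are placeholders:

from PIL import Image
from transformers import (BridgeTowerForImageAndTextRetrieval,
                          BridgeTowerProcessor)

model_id = "BridgeTower/bridgetower-large-itm-mlm"
processor = BridgeTowerProcessor.from_pretrained(model_id)
model = BridgeTowerForImageAndTextRetrieval.from_pretrained(model_id)

image = Image.open("frame.jpg")  # placeholder test image
encoding = processor(image, "wild bears", return_tensors="pt")
outputs = model(**encoding)
# logits[0, 1] is the image-text matching score
print("{:.2f}".format(outputs.logits[0, 1].item()))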
requirements.txt
ADDED
@@ -0,0 +1,4 @@
git+https://github.com/huggingface/transformers
torch
Pillow
opencv-python
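Note: installing transformers straight from the git main branch pulls whatever is current at build time, so builds are not reproducible; pinning a released version that includes BridgeTower would avoid surprise breakage.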
video.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a77bb5024f5028f7b95c5a7b6fdf1a6bb2ee787a250f5cd2a12ee96bca970f4
size 11623402
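video.mp4 is stored with Git LFS: the repository keeps only this pointer file (oid and size), while the 11.6 MB video itself lives in LFS storage.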