import gradio as gr
import torch
import numpy as np
from transformers import OwlViTProcessor, OwlViTForObjectDetection, ResNetModel
from torchvision import transforms
from PIL import Image
import cv2
import torch.nn.functional as F
import tempfile
import os

# Load models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet-50 backbone: embeds image crops for similarity scoring
resnet = ResNetModel.from_pretrained("microsoft/resnet-50")
resnet.eval()
resnet = resnet.to(device)

# OWL-ViT: image-guided (query-image) object detection
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)

# Preprocess a PIL image for ResNet-50 (ImageNet statistics)
def preprocess_image(image):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Force RGB so RGBA or grayscale inputs do not break the 3-channel normalization
    return transform(image.convert("RGB")).unsqueeze(0)

def extract_embedding(image):
    """Embed a PIL image with ResNet-50; returns a (1, 2048) pooled feature."""
    image_tensor = preprocess_image(image).to(device)
    with torch.no_grad():
        output = resnet(image_tensor)
        # pooler_output is (1, 2048, 1, 1); flatten to (1, 2048) for the metrics below
        embedding = output.pooler_output.flatten(1)
    return embedding

def cosine_similarity(embedding1, embedding2):
    # Higher means more similar; 1.0 is a perfect directional match
    return F.cosine_similarity(embedding1, embedding2)

def l2_distance(embedding1, embedding2):
    # Lower means more similar; 0.0 means identical embeddings
    return torch.norm(embedding1 - embedding2, p=2)
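
# A minimal sanity-check sketch for the two metrics above (not part of the app;
# the file path is a hypothetical placeholder). Comparing an image against
# itself should give a cosine similarity near 1.0 and an L2 distance near 0.0.
def _demo_similarity_metrics(path="query.png"):
    img = Image.open(path)
    emb = extract_embedding(img)
    print("cosine (self):", cosine_similarity(emb, emb).item())  # ~1.0
    print("l2 (self):", l2_distance(emb, emb).item())            # ~0.0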

def save_array_to_temp_image(arr):
    """Save a BGR (OpenCV) array to a temporary PNG and return its path."""
    rgb_arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)  # OpenCV is BGR; PIL expects RGB
    img = Image.fromarray(rgb_arr)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
    temp_file_name = temp_file.name
    temp_file.close()
    img.save(temp_file_name)
    return temp_file_name

def detect_and_crop(target_image, query_image, threshold=0.6, nms_threshold=0.3):
    """Run OWL-ViT image-guided detection and return the matching regions as BGR crops."""
    target_sizes = torch.tensor([target_image.size[::-1]])  # PIL size is (W, H); post-processing wants (H, W)
    inputs = processor(images=target_image, query_images=query_image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.image_guided_detection(**inputs)

    # target_image is an RGB PIL image; convert to BGR so the crops match the
    # OpenCV convention that save_array_to_temp_image expects
    img = cv2.cvtColor(np.array(target_image), cv2.COLOR_RGB2BGR)
    outputs.logits = outputs.logits.cpu()
    outputs.target_pred_boxes = outputs.target_pred_boxes.cpu()

    results = processor.post_process_image_guided_detection(
        outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
    )
    boxes, scores = results[0]["boxes"], results[0]["scores"]

    if len(boxes) == 0:
        return []

    cropped_images = []
    for box in boxes:
        x1, y1, x2, y2 = [int(i) for i in box.tolist()]
        cropped_img = img[y1:y2, x1:x2]
        if cropped_img.size != 0:  # skip degenerate boxes
            cropped_images.append(cropped_img)

    return cropped_images
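
# Hedged usage sketch for detect_and_crop outside the Gradio app. The file
# names are hypothetical placeholders; crops come back as BGR numpy arrays,
# which matches what cv2.imwrite expects.
def _demo_detect_and_crop(target_path="target.jpg", query_path="query.jpg"):
    target = Image.open(target_path).convert("RGB")
    query = Image.open(query_path).convert("RGB")
    crops = detect_and_crop(target, query, threshold=0.6, nms_threshold=0.3)
    for i, crop in enumerate(crops):
        cv2.imwrite(f"crop_{i}.png", crop)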

def process_video(video_path, query_image, skipframes=0):
    """Scan a video, detect query-image matches in sampled frames, and score each crop."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []

    # The query embedding never changes, so compute it once outside the loop
    query_embedding = extract_embedding(query_image)

    frame_count = 0
    all_results = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Process every (skipframes + 1)-th frame
        if frame_count % (skipframes + 1) == 0:
            frame_file = save_array_to_temp_image(frame)
            result_frames = detect_and_crop(Image.open(frame_file), query_image)
            for res in result_frames:
                saved_res = save_array_to_temp_image(res)
                crop_embedding = extract_embedding(Image.open(saved_res))
                dist = l2_distance(query_embedding, crop_embedding).item()
                cos = cosine_similarity(query_embedding, crop_embedding).item()
                all_results.append({'l2_dist': dist, 'cos': cos})
                os.remove(saved_res)  # clean up the crop's temp file
            os.remove(frame_file)  # clean up the frame's temp file
        frame_count += 1
    cap.release()
    return all_results

def process_videos_and_compare(image, video, skipframes=5, threshold=0.47):
    """Aggregate per-crop scores into averages/medians and a presence verdict."""
    def median(values):
        # Assumes a sorted, non-empty list
        n = len(values)
        return (values[n // 2 - 1] + values[n // 2]) / 2 if n % 2 == 0 else values[n // 2]

    results = process_video(video, image, skipframes)
    if results:
        l2_dists = [item['l2_dist'] for item in results]
        cosines = [item['cos'] for item in results]
        avg_l2_dist = sum(l2_dists) / len(l2_dists)
        avg_cos = sum(cosines) / len(cosines)
        median_l2_dist = median(sorted(l2_dists))
        median_cos = median(sorted(cosines))
        result = {
            "avg_l2_dist": avg_l2_dist,
            "avg_cos": avg_cos,
            "median_l2_dist": median_l2_dist,
            "median_cos": median_cos,
            "avg_cos_dist": 1 - avg_cos,
            "median_cos_dist": 1 - median_cos,
            "is_present": avg_cos >= threshold
        }
    else:
        result = {
            "avg_l2_dist": float('inf'),
            "avg_cos": 0,
            "median_l2_dist": float('inf'),
            "median_cos": 0,
            "avg_cos_dist": float('inf'),
            "median_cos_dist": float('inf'),
            "is_present": False
        }
    return result

def interface(video, image, skipframes, threshold):
    result = process_videos_and_compare(image, video, skipframes, threshold)
    return result
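
# Headless usage sketch: run the full pipeline without launching the Gradio UI.
# The paths are hypothetical placeholders for a local video and query image.
def _demo_headless(video_path="clip.mp4", image_path="query.png"):
    query = Image.open(image_path).convert("RGB")
    result = process_videos_and_compare(query, video_path, skipframes=5, threshold=0.47)
    print(result)  # dict with avg/median distances, similarities, and is_present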

iface = gr.Interface(
    fn=interface,
    inputs=[
        gr.Video(label="Upload a Video"),
        gr.Image(type="pil", label="Upload a Query Image"),
        gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Skip Frames"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.47, label="Threshold")
    ],
    outputs=[
        gr.JSON(label="Result")
    ],
    title="Object Detection in Video",
    description="""
    **Instructions:**

    1. **Upload a Video**: Select a video file to upload.
    2. **Upload a Query Image**: Select an image containing the object you want to detect in the video.
    3. **Set Skip Frames**: Adjust the slider to set how many frames to skip between processed frames.
    4. **Set Threshold**: Adjust the slider to set the cosine-similarity threshold used to decide whether the object is present.
    5. **View Results**: The output shows the average and median distances and similarities, plus whether the object is present in the video based on the threshold.
    """
)

if __name__ == "__main__":
    iface.launch()