import ffmpeg
import torch
import youtube_dl

import numpy as np
import streamlit as st

from sentence_transformers import SentenceTransformer, util, models
from PIL import Image

@st.cache(allow_output_mutation=True, max_entries=1)
def get_model():
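    # Load the multilingual CLIP text encoder and the CLIP image encoder; cached across Streamlit reruns by @st.cache.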
    txt_model = SentenceTransformer('clip-ViT-B-32-multilingual-v1').to(dtype=torch.float32, device=torch.device('cpu'))
    clip = models.CLIPModel()
    vis_model = SentenceTransformer(modules=[clip]).to(dtype=torch.float32, device=torch.device('cpu'))
    return txt_model, vis_model


def get_embedding(txt_model, vis_model, query, video):
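    # Encode the text query: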
    text_emb = txt_model.encode(query, device='cpu')

    # Encode an image:
    images = []
    for img in video:
        images.append(Image.fromarray(img))
    img_embs = vis_model.encode(images, device='cpu')

    return text_emb, img_embs

def find_frames(url, txt_model, vis_model, desc, seconds, top_k):
    text = st.text("Downloading video (Descargando video)...")
    # gif from https://giphy.com/gifs/alan-DfSXiR60W9MVq
    gif_runner = st.image("./loading.gif")
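    # Probe the stream for frame width/height, needed to reshape the raw RGB output below.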
    probe = ffmpeg.probe(url)
    video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
    width = int(video_stream['width'])
    height = int(video_stream['height'])
    out, _ = (
        ffmpeg
        .input(url, t=seconds)
        .output('pipe:', format='rawvideo', pix_fmt='rgb24')
        .run(capture_stdout=True)
    )

    text.text("Processing video (Procesando video)...")
    video = (
        np
        .frombuffer(out, np.uint8)
        .reshape([-1, height, width, 3])
    )[::10]

    txt_embd, img_embds = get_embedding(txt_model, vis_model, desc, video)
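    # Rank frames by cosine similarity to the query and keep the top-k matches.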
    cos_scores = np.array(util.cos_sim(txt_embd, img_embds))
    ids = np.argsort(cos_scores)[0][-top_k:]
    imgs = [Image.fromarray(video[i]) for i in ids]

    gif_runner.empty()
    text.empty()
    st.image(imgs)

with open("HOME.md", "r") as f:
    HOME_PAGE = f.read()

with open("INICIO.md", "r") as f:
    INICIO_PAGINA = f.read()

def main_page(txt_model, vis_model):
    st.title("Introducing Youtube CLIFS")
    
    st.markdown(HOME_PAGE)

def inicio_pagina(txt_model, vis_model):
    st.title("Presentando Youtube CLIFS")
    
    st.markdown(INICIO_PAGINA)

def clifs_page(txt_model, vis_model):
    st.title("CLIFS")

    st.sidebar.markdown("### Controls (Controles):")
    seconds = st.sidebar.slider(
        "How many seconds of video to consider? (¿Cuántos segundos de video considerar?)",
        min_value=10,
        max_value=120,
        value=60,
        step=1,
    )
    top_k = st.sidebar.slider(
        "Top K",
        min_value=1,
        max_value=5,
        value=3,
        step=1,
    )
    desc = st.sidebar.text_input(
        "Search Query (Consulta de Búsqueda)",
        value="Pancake in the shape of an otter",
        help="Text description of what you want to find in the video (Descripción de texto de que desea encontrar en el video)",
    )
    url = st.sidebar.text_input(
        "Youtube Video URL (URL del Video de Youtube)",
        value='https://youtu.be/xUv6XgPwGaQ',
        help="Youtube video you want to search (Video de Youtube que desea búscar)",
    )
    quality = st.sidebar.radio(
        "Quality of the Video (Calidad del Video)",
        [144, 240, 360, 480],
        help="Quality of the video to download. Higher quality takes more time (Calidad del video para descargar. Calidad más alta toma más tiempo)",
    )

    submit_button = st.sidebar.button("Search (Buscar)")
    if submit_button:
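        # Resolve the direct stream URL for the selected quality without downloading the whole file.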
        ydl_opts = {"format": f"mp4[height={quality}]"}
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            video_url = info_dict.get("url", None)
            find_frames(video_url, txt_model, vis_model, desc, seconds, top_k)

PAGES = {
    "CLIFS": clifs_page,
    "Home": main_page,
    "Inicio": inicio_pagina,
}



def run():
    st.set_page_config(page_title="Youtube CLIFS")
    # main body
    txt_model, vis_model = get_model()

    st.sidebar.title("Navigation (Navegación)")
    selection = st.sidebar.radio("Go to (Ir a)", list(PAGES.keys()))

    PAGES[selection](txt_model, vis_model)


if __name__ == "__main__":
    run()