Spaces:

vivien
/

clip

Running

File size: 8,681 Bytes

5185219
 
c81898a
 
0779f15
b59b1d0
5185219
c81898a
9033a8d
0779f15
 
 
3e50eda
 
a55de09
aae8769
fbe7708
c81898a
a55de09
aae8769
 
 
 
0779f15
aae8769
 
 
 
 
 
 
 
 
 
 
 
 
a55de09
 
c81898a
aae8769
 
0779f15
 
5185219
a55de09
 
aae8769
5185219
 
 
 
 
 
 
 
5b1c1bd
 
 
 
 
 
 
 
 
 
 
5185219
aae8769
5185219
5b1c1bd
 
aae8769
5b1c1bd
 
 
aae8769
5b1c1bd
aae8769
5b1c1bd
 
 
5185219
 
 
aae8769
 
5b1c1bd
 
 
5185219
 
a55de09
 
 
 
5185219
a55de09
 
 
 
c81898a
a55de09
c81898a
ff968d5
 
 
ee0cebf
ba03fb2
9ea8c8c
a55de09
 
74074fa
 
 
7a848b2
74074fa
 
aae8769
 
 
 
 
 
c81898a
 
a55de09
 
c81898a
 
 
 
ff968d5
 
 
 
 
 
 
 
c81898a
aae8769
 
555584f
aae8769
555584f
7600dc3
586f7e5
55fea56
586f7e5
c81898a
 
 
 
 
 
a55de09
 
 
 
74074fa
 
0779f15
3e50eda
0779f15
74074fa
a55de09
5185219
 
 
 
a55de09
aae8769
 
9033a8d
 
 
 
aae8769
 
0779f15
aae8769
 
8d49b55
aae8769
 
 
 
 
a55de09
aae8769
0779f15
aae8769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5185219
 
 
 
aae8769
5185219
 
aae8769
 
0779f15
3e50eda
5185219
a55de09

from html import escape
import re
import streamlit as st
import pandas as pd, numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from st_clickable_images import clickable_images

MODEL_NAMES = [
    #    "base-patch32",
    #    "base-patch16",
    #    "large-patch14",
    "large-patch14-336"
]


@st.cache(allow_output_mutation=True)
def load():
    df = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
    models = {}
    processors = {}
    embeddings = {}
    for name in MODEL_NAMES:
        models[name] = CLIPModel.from_pretrained(f"openai/clip-vit-{name}").eval()
        processors[name] = CLIPProcessor.from_pretrained(f"openai/clip-vit-{name}")
        embeddings[name] = {
            0: np.load(f"embeddings-vit-{name}.npy"),
            1: np.load(f"embeddings2-vit-{name}.npy"),
        }
        for k in [0, 1]:
            embeddings[name][k] = embeddings[name][k] / np.linalg.norm(
                embeddings[name][k], axis=1, keepdims=True
            )
    return models, processors, df, embeddings


models, processors, df, embeddings = load()
source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}


def compute_text_embeddings(list_of_strings, name):
    inputs = processors[name](text=list_of_strings, return_tensors="pt", padding=True)
    with torch.no_grad():
        result = models[name].get_text_features(**inputs).detach().numpy()
    return result / np.linalg.norm(result, axis=1, keepdims=True)


def image_search(query, corpus, name, n_results=24):
    positive_embeddings = None

    def concatenate_embeddings(e1, e2):
        if e1 is None:
            return e2
        else:
            return np.concatenate((e1, e2), axis=0)

    splitted_query = query.split("EXCLUDING ")
    dot_product = 0
    k = 0 if corpus == "Unsplash" else 1
    if len(splitted_query[0]) > 0:
        positive_queries = splitted_query[0].split(";")
        for positive_query in positive_queries:
            match = re.match(r"\[(Movies|Unsplash):(\d{1,5})\](.*)", positive_query)
            if match:
                corpus2, idx, remainder = match.groups()
                idx, remainder = int(idx), remainder.strip()
                k2 = 0 if corpus2 == "Unsplash" else 1
                positive_embeddings = concatenate_embeddings(
                    positive_embeddings, embeddings[name][k2][idx : idx + 1, :]
                )
                if len(remainder) > 0:
                    positive_embeddings = concatenate_embeddings(
                        positive_embeddings, compute_text_embeddings([remainder], name)
                    )
            else:
                positive_embeddings = concatenate_embeddings(
                    positive_embeddings, compute_text_embeddings([positive_query], name)
                )
        dot_product = embeddings[name][k] @ positive_embeddings.T
        dot_product = dot_product - np.median(dot_product, axis=0)
        dot_product = dot_product / np.max(dot_product, axis=0, keepdims=True)
        dot_product = np.min(dot_product, axis=1)

    if len(splitted_query) > 1:
        negative_queries = (" ".join(splitted_query[1:])).split(";")
        negative_embeddings = compute_text_embeddings(negative_queries, name)
        dot_product2 = embeddings[name][k] @ negative_embeddings.T
        dot_product2 = dot_product2 - np.median(dot_product2, axis=0)
        dot_product2 = dot_product2 / np.max(dot_product2, axis=0, keepdims=True)
        dot_product -= np.max(np.maximum(dot_product2, 0), axis=1)

    results = np.argsort(dot_product)[-1 : -n_results - 1 : -1]
    return [
        (
            df[k].iloc[i]["path"],
            df[k].iloc[i]["tooltip"] + source[k],
            i,
        )
        for i in results
    ]


description = """
# Semantic image search

**Enter your query and hit enter**

*Built with OpenAI's [CLIP](https://openai.com/blog/clip/) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*

*Inspired by [Unsplash Image Search](https://github.com/haltakov/natural-language-image-search) from Vladimir Haltakov and [Alph, The Sacred River](https://github.com/thoppe/alph-the-sacred-river) from Travis Hoppe*
"""

howto = """
- Click on an image to use it as a query and find similar images
- Several queries, including one based on an image, can be combined (use "**;**" as a separator)
- If the input includes "**EXCLUDING**", the part right of it will be used as a negative query
"""

div_style = {
    "display": "flex",
    "justify-content": "center",
    "flex-wrap": "wrap",
}


def main():
    st.markdown(
        """
              <style>
              .block-container{
                max-width: 1200px;
              }
              div.row-widget.stRadio > div{
                flex-direction:row;
                display: flex;
                justify-content: center;
              }
              div.row-widget.stRadio > div > label{
                margin-left: 5px;
                margin-right: 5px;
              }
              .row-widget {
                margin-top: -25px;
              }
              section>div:first-child {
                padding-top: 30px;
              }
              div.reportview-container > section:first-child{
                max-width: 320px;
              }
              #MainMenu {
                visibility: hidden;
              }
              footer {
                visibility: hidden;
              }
              </style>""",
        unsafe_allow_html=True,
    )
    st.sidebar.markdown(description)
    with st.sidebar.expander("Advanced use"):
        st.markdown(howto)
    # mode = st.sidebar.selectbox(
    #    "", ["Results for ViT-L/14@336px", "Comparison of 2 models"], index=0
    # )

    _, c, _ = st.columns((1, 3, 1))
    if "query" in st.session_state:
        query = c.text_input("", value=st.session_state["query"])
    else:
        query = c.text_input("", value="clouds at sunset")
    corpus = st.radio("", ["Unsplash", "Movies"])

    models_dict = {
        "ViT-B/32 (quicker)": "base-patch32",
        "ViT-B/16 (average)": "base-patch16",
        # "ViT-L/14 (slow)": "large-patch14",
        "ViT-L/14@336px (slower)": "large-patch14-336",
    }

    if False:  # "Comparison" in mode:
        c1, c2 = st.columns((1, 1))
        selection1 = c1.selectbox("", models_dict.keys(), index=0)
        selection2 = c2.selectbox("", models_dict.keys(), index=2)
        name1 = models_dict[selection1]
        name2 = models_dict[selection2]
    else:
        name1 = MODEL_NAMES[-1]

    if len(query) > 0:
        results1 = image_search(query, corpus, name1)
        if False:  # "Comparison" in mode:
            with c1:
                clicked1 = clickable_images(
                    [result[0] for result in results1],
                    titles=[result[1] for result in results1],
                    div_style=div_style,
                    img_style={"margin": "2px", "height": "150px"},
                    key=query + corpus + name1 + "1",
                )
            results2 = image_search(query, corpus, name2)
            with c2:
                clicked2 = clickable_images(
                    [result[0] for result in results2],
                    titles=[result[1] for result in results2],
                    div_style=div_style,
                    img_style={"margin": "2px", "height": "150px"},
                    key=query + corpus + name2 + "2",
                )
        else:
            clicked1 = clickable_images(
                [result[0] for result in results1],
                titles=[result[1] for result in results1],
                div_style=div_style,
                img_style={"margin": "2px", "height": "200px"},
                key=query + corpus + name1 + "1",
            )
            clicked2 = -1

        if clicked2 >= 0 or clicked1 >= 0:
            change_query = False
            if "last_clicked" not in st.session_state:
                change_query = True
            else:
                if max(clicked2, clicked1) != st.session_state["last_clicked"]:
                    change_query = True
            if change_query:
                if clicked1 >= 0:
                    st.session_state["query"] = f"[{corpus}:{results1[clicked1][2]}]"
                # elif clicked2 >= 0:
                #    st.session_state["query"] = f"[{corpus}:{results2[clicked2][2]}]"
                st.experimental_rerun()


if __name__ == "__main__":
    main()