Spaces:

vivien
/

clip

Running

App Files Community

Vivien committed on Apr 27, 2022

Commit

aae8769

1 Parent(s): 5b1c1bd

Add side-by-side comparison of the ViT models

Browse files

Files changed (9) hide show

app.py +96 -47
embeddings-vit-base-patch16.npy +3 -0
embeddings-vit-base-patch32.npy +3 -0
embeddings-vit-large-patch14-336.npy +3 -0
embeddings.npy → embeddings-vit-large-patch14.npy +0 -0
embeddings2-vit-base-patch16.npy +3 -0
embeddings2-vit-base-patch32.npy +3 -0
embeddings2-vit-large-patch14-336.npy +3 -0
embeddings2.npy → embeddings2-vit-large-patch14.npy +0 -0

app.py CHANGED Viewed

@@ -5,38 +5,40 @@ import pandas as pd, numpy as np
 from transformers import CLIPProcessor, CLIPModel
 from st_clickable_images import clickable_images
-@st.cache(
-    show_spinner=False,
-    hash_funcs={
-        CLIPModel: lambda _: None,
-        CLIPProcessor: lambda _: None,
-        dict: lambda _: None,
-    },
-)
 def load():
-    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
-    processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
     df = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
-    embeddings = {0: np.load("embeddings.npy"), 1: np.load("embeddings2.npy")}
-    for k in [0, 1]:
-        embeddings[k] = embeddings[k] / np.linalg.norm(
-            embeddings[k], axis=1, keepdims=True
-        )
-    return model, processor, df, embeddings
-model, processor, df, embeddings = load()
 source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}
-def compute_text_embeddings(list_of_strings):
-    inputs = processor(text=list_of_strings, return_tensors="pt", padding=True)
-    result = model.get_text_features(**inputs).detach().numpy()
     return result / np.linalg.norm(result, axis=1, keepdims=True)
-def image_search(query, corpus, n_results=24):
     positive_embeddings = None
     def concatenate_embeddings(e1, e2):
@@ -57,25 +59,25 @@ def image_search(query, corpus, n_results=24):
                 idx, remainder = int(idx), remainder.strip()
                 k2 = 0 if corpus2 == "Unsplash" else 1
                 positive_embeddings = concatenate_embeddings(
-                    positive_embeddings, embeddings[k2][idx : idx + 1, :]
                 )
                 if len(remainder) > 0:
                     positive_embeddings = concatenate_embeddings(
-                        positive_embeddings, compute_text_embeddings([remainder])
                     )
             else:
                 positive_embeddings = concatenate_embeddings(
-                    positive_embeddings, compute_text_embeddings([positive_query])
                 )
-        dot_product = embeddings[k] @ positive_embeddings.T
         dot_product = dot_product - np.median(dot_product, axis=0)
         dot_product = dot_product / np.max(dot_product, axis=0, keepdims=True)
         dot_product = np.min(dot_product, axis=1)
     if len(splitted_query) > 1:
         negative_queries = (" ".join(splitted_query[1:])).split(";")
-        negative_embeddings = compute_text_embeddings(negative_queries)
-        dot_product2 = embeddings[k] @ negative_embeddings.T
         dot_product2 = dot_product2 - np.median(dot_product2, axis=0)
         dot_product2 = dot_product2 / np.max(dot_product2, axis=0, keepdims=True)
         dot_product -= np.max(np.maximum(dot_product2, 0), axis=1)
@@ -96,7 +98,7 @@ description = """
 **Enter your query and hit enter**
-*Built with OpenAI's [CLIP](https://openai.com/blog/clip/) model, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*
 *Inspired by [Unsplash Image Search](https://github.com/haltakov/natural-language-image-search) from Vladimir Haltakov and [Alph, The Sacred River](https://github.com/thoppe/alph-the-sacred-river) from Travis Hoppe*
 """
@@ -107,6 +109,12 @@ howto = """
 - If the input includes "**EXCLUDING**", the part right of it will be used as a negative query
 """
 def main():
     st.markdown(
@@ -124,10 +132,10 @@ def main():
                 margin-left: 5px;
                 margin-right: 5px;
               }
-              section.main>div:first-child {
-                padding-top: 0px;
               }
-              section:not(.main)>div:first-child {
                 padding-top: 30px;
               }
               div.reportview-container > section:first-child{
@@ -145,6 +153,9 @@ def main():
     st.sidebar.markdown(description)
     with st.sidebar.expander("Advanced use"):
         st.markdown(howto)
     _, c, _ = st.columns((1, 3, 1))
     if "query" in st.session_state:
@@ -152,27 +163,65 @@ def main():
     else:
         query = c.text_input("", value="clouds at sunset")
     corpus = st.radio("", ["Unsplash", "Movies"])
     if len(query) > 0:
-        results = image_search(query, corpus)
-        clicked = clickable_images(
-            [result[0] for result in results],
-            titles=[result[1] for result in results],
-            div_style={
-                "display": "flex",
-                "justify-content": "center",
-                "flex-wrap": "wrap",
-            },
-            img_style={"margin": "2px", "height": "200px"},
-        )
-        if clicked >= 0:
             change_query = False
             if "last_clicked" not in st.session_state:
                 change_query = True
             else:
-                if clicked != st.session_state["last_clicked"]:
                     change_query = True
             if change_query:
-                st.session_state["query"] = f"[{corpus}:{results[clicked][2]}]"
                 st.experimental_rerun()

 from transformers import CLIPProcessor, CLIPModel
 from st_clickable_images import clickable_images
+MODEL_NAMES = ["base-patch32", "base-patch16", "large-patch14", "large-patch14-336"]
+@st.cache(show_spinner=False, hash_funcs={dict: lambda _: None})
 def load():
     df = {0: pd.read_csv("data.csv"), 1: pd.read_csv("data2.csv")}
+    models = {}
+    processors = {}
+    embeddings = {}
+    for name in MODEL_NAMES:
+        models[name] = CLIPModel.from_pretrained(f"openai/clip-vit-{name}")
+        processors[name] = CLIPProcessor.from_pretrained(f"openai/clip-vit-{name}")
+        embeddings[name] = {
+            0: np.load(f"embeddings-vit-{name}.npy"),
+            1: np.load(f"embeddings2-vit-{name}.npy"),
+        }
+        for k in [0, 1]:
+            embeddings[name][k] = embeddings[name][k] / np.linalg.norm(
+                embeddings[name][k], axis=1, keepdims=True
+            )
+    return models, processors, df, embeddings
+models, processors, df, embeddings = load()
 source = {0: "\nSource: Unsplash", 1: "\nSource: The Movie Database (TMDB)"}
+def compute_text_embeddings(list_of_strings, name):
+    inputs = processors[name](text=list_of_strings, return_tensors="pt", padding=True)
+    result = models[name].get_text_features(**inputs).detach().numpy()
     return result / np.linalg.norm(result, axis=1, keepdims=True)
+def image_search(query, corpus, name, n_results=24):
     positive_embeddings = None
     def concatenate_embeddings(e1, e2):
                 idx, remainder = int(idx), remainder.strip()
                 k2 = 0 if corpus2 == "Unsplash" else 1
                 positive_embeddings = concatenate_embeddings(
+                    positive_embeddings, embeddings[name][k2][idx : idx + 1, :]
                 )
                 if len(remainder) > 0:
                     positive_embeddings = concatenate_embeddings(
+                        positive_embeddings, compute_text_embeddings([remainder], name)
                     )
             else:
                 positive_embeddings = concatenate_embeddings(
+                    positive_embeddings, compute_text_embeddings([positive_query], name)
                 )
+        dot_product = embeddings[name][k] @ positive_embeddings.T
         dot_product = dot_product - np.median(dot_product, axis=0)
         dot_product = dot_product / np.max(dot_product, axis=0, keepdims=True)
         dot_product = np.min(dot_product, axis=1)
     if len(splitted_query) > 1:
         negative_queries = (" ".join(splitted_query[1:])).split(";")
+        negative_embeddings = compute_text_embeddings(negative_queries, name)
+        dot_product2 = embeddings[name][k] @ negative_embeddings.T
         dot_product2 = dot_product2 - np.median(dot_product2, axis=0)
         dot_product2 = dot_product2 / np.max(dot_product2, axis=0, keepdims=True)
         dot_product -= np.max(np.maximum(dot_product2, 0), axis=1)
 **Enter your query and hit enter**
+*Built with OpenAI's [CLIP](https://openai.com/blog/clip/) models, 🤗 Hugging Face's [transformers library](https://huggingface.co/transformers/), [Streamlit](https://streamlit.io/), 25k images from [Unsplash](https://unsplash.com/) and 8k images from [The Movie Database (TMDB)](https://www.themoviedb.org/)*
 *Inspired by [Unsplash Image Search](https://github.com/haltakov/natural-language-image-search) from Vladimir Haltakov and [Alph, The Sacred River](https://github.com/thoppe/alph-the-sacred-river) from Travis Hoppe*
 """
 - If the input includes "**EXCLUDING**", the part right of it will be used as a negative query
 """
+div_style = {
+    "display": "flex",
+    "justify-content": "center",
+    "flex-wrap": "wrap",
+}
 def main():
     st.markdown(
                 margin-left: 5px;
                 margin-right: 5px;
               }
+              .row-widget {
+                margin-top: -25px;
               }
+              section>div:first-child {
                 padding-top: 30px;
               }
               div.reportview-container > section:first-child{
     st.sidebar.markdown(description)
     with st.sidebar.expander("Advanced use"):
         st.markdown(howto)
+    mode = st.sidebar.selectbox(
+        "", ["Results for ViT-L/14@336px", "Comparison of 2 models"], index=0
+    )
     _, c, _ = st.columns((1, 3, 1))
     if "query" in st.session_state:
     else:
         query = c.text_input("", value="clouds at sunset")
     corpus = st.radio("", ["Unsplash", "Movies"])
+    models_dict = {
+        "ViT-B/32 (quickest)": "base-patch32",
+        "ViT-B/16 (quick)": "base-patch16",
+        "ViT-L/14 (slow)": "large-patch14",
+        "ViT-L/14@336px (slowest)": "large-patch14-336",
+    }
+    if "Comparison" in mode:
+        c1, c2 = st.columns((1, 1))
+        selection1 = c1.selectbox("", models_dict.keys(), index=0)
+        selection2 = c2.selectbox("", models_dict.keys(), index=3)
+        name1 = models_dict[selection1]
+        name2 = models_dict[selection2]
+    else:
+        name1 = MODEL_NAMES[-1]
     if len(query) > 0:
+        results1 = image_search(query, corpus, name1)
+        if "Comparison" in mode:
+            with c1:
+                clicked1 = clickable_images(
+                    [result[0] for result in results1],
+                    titles=[result[1] for result in results1],
+                    div_style=div_style,
+                    img_style={"margin": "2px", "height": "150px"},
+                    key=query + corpus + name1 + "1",
+                )
+            results2 = image_search(query, corpus, name2)
+            with c2:
+                clicked2 = clickable_images(
+                    [result[0] for result in results2],
+                    titles=[result[1] for result in results2],
+                    div_style=div_style,
+                    img_style={"margin": "2px", "height": "150px"},
+                    key=query + corpus + name2 + "2",
+                )
+        else:
+            clicked1 = clickable_images(
+                [result[0] for result in results1],
+                titles=[result[1] for result in results1],
+                div_style=div_style,
+                img_style={"margin": "2px", "height": "200px"},
+                key=query + corpus + name1 + "1",
+            )
+            clicked2 = -1
+        if clicked2 >= 0 or clicked1 >= 0:
             change_query = False
             if "last_clicked" not in st.session_state:
                 change_query = True
             else:
+                if max(clicked2, clicked1) != st.session_state["last_clicked"]:
                     change_query = True
             if change_query:
+                if clicked1 >= 0:
+                    st.session_state["query"] = f"[{corpus}:{results1[clicked1][2]}]"
+                elif clicked2 >= 0:
+                    st.session_state["query"] = f"[{corpus}:{results2[clicked2][2]}]"
                 st.experimental_rerun()

embeddings-vit-base-patch16.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:125430e11a4a415ec0c0fc5339f97544f0447e4b0a24c20f2e59f8852e706afc
+size 51200128

embeddings-vit-base-patch32.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f7ebdff24079665faf58d07045056a63b5499753e3ffbda479691d53de3ab38
+size 51200128

embeddings-vit-large-patch14-336.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f79f10ebe267b4ee7acd553dfe0ee31df846123630058a6d58c04bf22e0ad068
+size 76800128

embeddings.npy → embeddings-vit-large-patch14.npy RENAMED Viewed

File without changes

embeddings2-vit-base-patch16.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:153cf3fae2385d51fe8729d3a1c059f611ca47a3fc501049708114d1bbf79049
+size 16732288

embeddings2-vit-base-patch32.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7d545bed86121dac1cedcc1de61ea5295f5840c1eb751637e6628ac54faef81
+size 16732288

embeddings2-vit-large-patch14-336.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e66eb377465fbfaa56cec079aa3e214533ceac43646f2ca78028ae4d8ad6d03
+size 25098368

embeddings2.npy → embeddings2-vit-large-patch14.npy RENAMED Viewed

File without changes