jamescalam committed on
Commit
88172be
1 Parent(s): 814d271

upgrade to contrastive learning and unsplash lite dataset

Files changed (3)
  1. app.py +120 -53
  2. link-check.py +58 -0
  3. unsplash-25k-clip-indexer.ipynb +775 -0
app.py CHANGED
@@ -11,19 +11,10 @@ import logging
11
  from urllib3.exceptions import ProtocolError
12
 
13
  PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"] # app.pinecone.io
14
- INDEX = "imagenet-query-trainer-clip"
15
  MODEL_ID = "openai/clip-vit-base-patch32"
16
  DIMS = 512
17
 
18
- @st.experimental_singleton(show_spinner=False)
19
- def init_dataset():
20
- return load_dataset(
21
- 'frgfm/imagenette',
22
- 'full_size',
23
- split='train',
24
- ignore_verifications=False # set to True if seeing splits Error
25
- )
26
-
27
  @st.experimental_singleton(show_spinner=False)
28
  def init_clip():
29
  tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
@@ -39,7 +30,12 @@ def init_db():
39
  meta_field = datetime.now().isoformat()
40
  return meta_field, pinecone.Index(INDEX)
41
 
42
- def query(xq, top_k=10, include_values=True, filter=None):
43
  logging.info(f"Query to Pinecone with '{st.session_state.meta}'")
44
  attempt = 0
45
  while attempt < 3:
@@ -48,14 +44,15 @@ def query(xq, top_k=10, include_values=True, filter=None):
48
  xq,
49
  top_k=top_k,
50
  include_values=include_values,
 
51
  filter=filter
52
  )
53
- matches = {match['id']: match['values'] for match in xc['matches']}
54
  break
55
  except ProtocolError:
56
  attempt += 1
57
- matches = {}
58
- if len(matches.keys()) == 0:
59
  logging.error(f"No matches found for '{st.session_state.meta}'")
60
  return matches
61
 
@@ -108,44 +105,34 @@ def pil_to_bytes(img):
108
  img_bin = base64.b64encode(img_bin).decode('utf-8')
109
  return img_bin
110
 
111
- def card(i):
112
- img = imagenet[int(i)]['image']
113
- img_bin = pil_to_bytes(img)
114
- return f'<img id="img{i}" src="data:image/jpeg;base64,{img_bin}" width="200px;">'
115
 
116
- def get_top_k(xq, top_k=10):
117
  matches = query(
118
- xq, top_k=top_k, include_values=True, filter={st.session_state.meta: {"$ne": 1}}
 
119
  )
120
  return matches
121
 
122
- def tune(matches, inputs, iters=5):
123
- positive_idx = [idx for idx, val in inputs.items() if val == 1]
124
- negatives = [match for match in matches.items() if match[0] not in positive_idx]
125
- negative_idx = [match[0] for match in negatives]
126
- negative_vectors = [match[1] for match in negatives]
127
- positive_vectors = [match[1] for match in matches.items() if match[0] in positive_idx]
128
- # prep training data
129
- y = [1] * len(positive_idx) + [0] * len(negative_idx)
130
- X = positive_vectors + negative_vectors
131
  # train the classifier
 
132
  st.session_state.clf.fit(X, y, iters=iters)
133
  # extract new vector
134
  st.session_state.xq = st.session_state.clf.get_weights()
135
- # update one record at a time
136
- for i in positive_idx + negative_idx:
137
- st.session_state.index.update(str(i), set_metadata={st.session_state.meta: 1})
138
 
139
  def refresh_index():
140
  logging.info(f"Refresh for '{st.session_state.meta}'")
 
141
  xq = st.session_state.xq
142
  if type(xq) is not list:
143
  xq = xq.tolist()
144
  while True:
145
  matches = query(xq, top_k=100, filter={st.session_state.meta: 1})
146
- idx = list(matches.keys())
147
- if len(idx) == 0: break
148
- for i in idx:
149
  st.session_state.index.update(str(i), set_metadata={st.session_state.meta: 0})
150
  # refresh session states
151
  del st.session_state.clf, st.session_state.xq
@@ -156,19 +143,26 @@ def calc_dist():
156
  return np.linalg.norm(xq - orig_xq)
157
 
158
  def submit():
 
159
  matches = st.session_state.matches
160
- velocity = st.session_state.velocity
161
- inputs = {}
162
  states = [
163
  st.session_state[f"input{i}"] for i in range(len(matches))
164
  ]
165
- for i, idx in enumerate(matches.keys()):
166
- inputs[idx] = int(states[i])
167
  states[i] = False
168
  # reset states to unchecked
169
  for i in range(len(matches)):
170
  st.session_state[f"input{i}"] = False
171
- tune(matches, inputs, iters=velocity)
172
 
173
  def delete_element(element):
174
  del element
@@ -180,18 +174,72 @@ st.markdown("""
180
  />
181
  """, unsafe_allow_html=True)
182
183
  with st.spinner("Initializing everything..."):
184
- imagenet = init_dataset()
185
  st.session_state.meta, st.session_state.index = init_db()
186
  if 'xq' not in st.session_state:
187
  tokenizer, clip = init_clip()
188
 
189
  if 'xq' not in st.session_state:
190
- start = [st.empty(), st.empty(), st.empty(), st.empty()]
191
- prompt = start[0].text_input("Prompt:", value="")
192
- prompt_xq = start[1].button("Prompt", disabled=len(prompt) == 0)
193
- random_xq = start[2].button("Random", disabled=len(prompt) != 0)
194
- start[3].markdown('Not sure what to write? Try **"dogs in the snow"**, **"close up of a dog"**, **"sony radio"**, or click **Random**.')
 
195
  if random_xq:
196
  print("r_xq")
197
  xq, orig_xq = init_random_query()
@@ -216,17 +264,36 @@ if 'xq' in st.session_state:
216
  refresh_index()
217
  else:
218
  # if we want to display images we end up here
219
- st.markdown(f"Distance travelled: *{round(calc_dist(), 4)}*")
220
  # first retrieve images from pinecone
221
- st.session_state.matches = get_top_k(st.session_state.xq, top_k=10)
222
  # once retrieved, display them alongside checkboxes in a form
223
  with st.form("my_form", clear_on_submit=False):
224
- velocity = st.slider("Velocity", 0, 20, 5, key="velocity")
 
225
  # we have three columns in the form
226
  cols = st.columns(3)
227
- for i, idx in enumerate(st.session_state.matches.keys()):
228
  # the card shows an image and a checkbox
229
- cols[i%3].markdown(card(idx), unsafe_allow_html=True)
230
  # we access the values of the checkbox via st.session_state[f"input{i}"]
231
- cols[i%3].checkbox("Relevant", key=f"input{i}")
232
- st.form_submit_button("Tune", on_click=submit)
11
  from urllib3.exceptions import ProtocolError
12
 
13
  PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"] # app.pinecone.io
14
+ INDEX = "unsplash-25k-clip"
15
  MODEL_ID = "openai/clip-vit-base-patch32"
16
  DIMS = 512
17
18
  @st.experimental_singleton(show_spinner=False)
19
  def init_clip():
20
  tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
 
30
  meta_field = datetime.now().isoformat()
31
  return meta_field, pinecone.Index(INDEX)
32
 
33
+ @st.experimental_singleton(show_spinner=False)
34
+ def init_query_num():
35
+ print("init query_num")
36
+ return 0
37
+
38
+ def query(xq, top_k=10, include_values=True, include_metadata=True, filter=None):
39
  logging.info(f"Query to Pinecone with '{st.session_state.meta}'")
40
  attempt = 0
41
  while attempt < 3:
 
44
  xq,
45
  top_k=top_k,
46
  include_values=include_values,
47
+ include_metadata=include_metadata,
48
  filter=filter
49
  )
50
+ matches = xc['matches']
51
  break
52
  except ProtocolError:
53
  attempt += 1
54
+ matches = []
55
+ if len(matches) == 0:
56
  logging.error(f"No matches found for '{st.session_state.meta}'")
57
  return matches
58
 
 
105
  img_bin = base64.b64encode(img_bin).decode('utf-8')
106
  return img_bin
107
 
108
+ def card(i, url):
109
+ return f'<img id="img{i}" src="{url}" width="200px;">'
 
 
110
 
111
+ def get_top_k(xq, top_k=9):
112
  matches = query(
113
+ xq, top_k=top_k, include_values=True, include_metadata=True,
114
+ filter={st.session_state.meta: {"$ne": 1}}
115
  )
116
  return matches
117
 
118
+ def tune(X, y, iters=5):
119
  # train the classifier
120
+ print(y)
121
  st.session_state.clf.fit(X, y, iters=iters)
122
  # extract new vector
123
  st.session_state.xq = st.session_state.clf.get_weights()
 
 
 
124
 
125
  def refresh_index():
126
  logging.info(f"Refresh for '{st.session_state.meta}'")
127
+ st.session_state.query_num = 0
128
  xq = st.session_state.xq
129
  if type(xq) is not list:
130
  xq = xq.tolist()
131
  while True:
132
  matches = query(xq, top_k=100, filter={st.session_state.meta: 1})
133
+ id_vals = [match['id'] for match in matches]
134
+ if len(id_vals) == 0: break
135
+ for i in id_vals:
136
  st.session_state.index.update(str(i), set_metadata={st.session_state.meta: 0})
137
  # refresh session states
138
  del st.session_state.clf, st.session_state.xq
 
143
  return np.linalg.norm(xq - orig_xq)
144
 
145
  def submit():
146
+ st.session_state.query_num += 1
147
  matches = st.session_state.matches
148
+ velocity = 2 #st.session_state.velocity
149
+ scores = {}
150
  states = [
151
  st.session_state[f"input{i}"] for i in range(len(matches))
152
  ]
153
+ for i, match in enumerate(matches):
154
+ scores[match['id']] = float(states[i])
155
  states[i] = False
156
  # reset states to unchecked
157
  for i in range(len(matches)):
158
  st.session_state[f"input{i}"] = False
159
+ # get training data and labels
160
+ X = list([match['values'] for match in matches])
161
+ y = list(scores.values())
162
+ tune(X, y, iters=velocity)
163
+ # update record metadata after training
164
+ for match in matches:
165
+ st.session_state.index.update(str(match['id']), set_metadata={st.session_state.meta: 1})
166
 
167
  def delete_element(element):
168
  del element
 
174
  />
175
  """, unsafe_allow_html=True)
176
 
177
+ messages = [
178
+ f"""
179
+ Welcome to the semantic query trainer app! Here we will demo how to efficiently train
180
+ a classifier to *very accurately* classify images based on their semantic content.
181
+
182
+ First, we need to initialize the classifier with a simple prompt. Try and write something
183
+ similar to what you're looking for, or if you want a challenge, try something completely
184
+ different.
185
+ """,
186
+ f"""
187
+ With the first query we have initialized the classifier weights (they're a 512-d vector)
188
+ and used those weights to perform a *vector search* to find image embeddings (also 512-d
189
+ vectors) that closely match the classifier weights.
190
+
191
+ These are essentially the images that the classifier would currently classify as "positive".
192
+
193
+ Based on your *target class* for the classifier, decide how relevant each of the images below
194
+ is, rating them from -1 (completely irrelevant) to +1 (a perfect match).
195
+ """,
196
+ f"""
197
+ Each of the image embeddings is paired with the *score* that you just gave it. These are
198
+ all fed into the classifier and used to train it. The classifier learns to *move* towards
199
+ the positively scored images, and to *avoid* the negatively scored images.
200
+ """,
201
+ f"""
202
+ As we repeat the process, the classifier rapidly learns the target space of our intended
203
+ class.
204
+
205
+ Typically, we don't train classifiers like this; instead, we label a huge dataset and train
206
+ the classifier across all images and their labels. This is massively inefficient. Here we
207
+ save annotation and compute time by using vector search to identify and focus on the images
208
+ that make the *biggest* difference in classifier performance.
209
+ """,
210
+ f"""
211
+ We shouldn't need to repeat this process many times before our classifier converges on our
212
+ target space. Once we begin returning only relevant images, we can stop training the classifier.
213
+
214
+ *(In this demo, you can try changing your target space and 'traversing' the vector space
215
+ to the new target space)*
216
+ """,
217
+ f"""
218
+ The app uses the [Pinecone vector database](https://pinecone.io/) to store and query images
219
+ using vector search. All images are sourced from the [Unsplash Lite dataset](https://github.com/unsplash/datasets) and encoded
220
+ using [OpenAI's CLIP](https://huggingface.co/openai/clip-vit-base-patch32). We explain how
221
+ it all works [here](https://classifier-train-vector-search--optimistic-curran-b817a8.netlify.app/learn/classifier-train-vector-search/).
222
+ """
223
+ ]
224
+
225
  with st.spinner("Initializing everything..."):
 
226
  st.session_state.meta, st.session_state.index = init_db()
227
  if 'xq' not in st.session_state:
228
  tokenizer, clip = init_clip()
229
+ st.session_state.query_num = 0
230
+
231
+ if st.session_state.query_num+1 < len(messages):
232
+ msg = messages[st.session_state.query_num+1]
233
+ else:
234
+ msg = messages[-1]
235
 
236
  if 'xq' not in st.session_state:
237
+ start = [st.empty(), st.empty(), st.empty(), st.empty(), st.empty()]
238
+ start[0].info(msg, icon="⁉️")
239
+ prompt = start[1].text_input("Prompt:", value="")
240
+ prompt_xq = start[2].button("Prompt", disabled=len(prompt) == 0)
241
+ random_xq = start[3].button("Random", disabled=len(prompt) != 0)
242
+ start[4].markdown('Not sure what to write? Try **"dogs in the snow"**, **"close up of a dog"**, **"sony radio"**, or click **Random**.')
243
  if random_xq:
244
  print("r_xq")
245
  xq, orig_xq = init_random_query()
 
264
  refresh_index()
265
  else:
266
  # if we want to display images we end up here
267
+ st.info(msg, icon="🔎")
268
  # first retrieve images from pinecone
269
+ st.session_state.matches = get_top_k(st.session_state.xq, top_k=9)
270
  # once retrieved, display them alongside checkboxes in a form
271
  with st.form("my_form", clear_on_submit=False):
272
+ st.form_submit_button("Tune", on_click=submit)
273
+ #velocity = st.slider("Velocity", 1, 8, 2, key="velocity")
274
  # we have three columns in the form
275
  cols = st.columns(3)
276
+ for i, match in enumerate(st.session_state.matches):
277
+ # find good url
278
+ loc = match["metadata"].get("good_url")
279
+ if loc:
280
+ url = match["metadata"][loc]
281
+ if loc == "photo_url":
282
+ url += "/download?force=true&w=640"
283
+ disabled = False
284
+ else:
285
+ # will show no image, but not sure what else to place here
286
+ url = match["metadata"]["photo_url"]
287
+ disabled=True
288
  # the card shows an image and a checkbox
289
+ cols[i%3].markdown(card(i, url), unsafe_allow_html=True)
290
  # we access the values of the checkbox via st.session_state[f"input{i}"]
291
+ cols[i%3].slider(
292
+ "Relevance",
293
+ min_value=-1.0,
294
+ max_value=1.0,
295
+ value=0.0,
296
+ step=0.1,
297
+ key=f"input{i}",
298
+ disabled=disabled
299
+ )
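
The `messages` above describe the tuning loop in prose: each displayed image contributes its CLIP embedding plus the relevance score from its slider, `tune()` fits the classifier on those pairs, and the classifier's weights become the next query vector. The classifier behind `st.session_state.clf` is defined elsewhere in app.py and is not part of this diff, so the sketch below is only a hypothetical minimal stand-in (a single linear scorer nudged by a few gradient steps), not the app's actual implementation.

```python
import numpy as np

class LinearScorer:
    """Hypothetical stand-in for st.session_state.clf: a 512-d weight vector
    nudged towards positively scored embeddings and away from negative ones."""

    def __init__(self, xq, lr=0.1):
        self.w = np.asarray(xq, dtype=np.float32)  # start from the prompt/random query vector
        self.lr = lr

    def fit(self, X, y, iters=2):
        X = np.asarray(X, dtype=np.float32)  # (n, 512) image embeddings returned by Pinecone
        y = np.asarray(y, dtype=np.float32)  # (n,) slider scores in [-1, 1]
        for _ in range(iters):
            scores = X @ self.w                              # current dot-product relevance
            grad = ((scores - y)[:, None] * X).mean(axis=0)  # squared-error gradient
            self.w -= self.lr * grad                         # step towards the scored targets
        self.w /= np.linalg.norm(self.w)                     # keep the query on the unit sphere

    def get_weights(self):
        return self.w.tolist()  # used as the next Pinecone query vector
```

Because the indexed image embeddings are unit-normalized and the index uses a dot-product metric, keeping the weights on the unit sphere makes each retraining step behave like moving a cosine-similarity query towards the positively scored images.
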
link-check.py ADDED
@@ -0,0 +1,58 @@
1
+ import pinecone
2
+ import requests
3
+ from tqdm.auto import tqdm
4
+ import logging
5
+
6
+ # we run this to check for broken links
7
+
8
+ PINECONE_API_KEY = "<<API_KEY_HERE>>"
9
+ INDEX = "unsplash-25k-clip"
10
+
11
+ pinecone.init(
12
+ api_key=PINECONE_API_KEY,
13
+ environment="us-west1-gcp"
14
+ )
15
+
16
+ index = pinecone.Index(INDEX)
17
+
18
+ dim = index.describe_index_stats()['dimension']
19
+ total = int(index.describe_index_stats()['totalVectorCount'])
20
+ xq = [0.0] * dim
21
+
22
+ count = 0
23
+ ID_LIST = []
24
+
25
+ logging.info("Checking links...")
26
+
27
+ with tqdm(total=total) as pbar:
28
+ while True:
29
+ xc = index.query(
30
+ xq, top_k=100, include_metadata=True,
31
+ filter={"link_check": {"$ne": True}}
32
+ )
33
+ matches = xc['matches']
34
+ if len(matches) == 0:
35
+ break
36
+ for match in matches:
37
+ photo_url = match['metadata']['photo_url']+"/download?force=true&w=640"
38
+ res = requests.get(photo_url)
39
+ if res.status_code == 200:
40
+ good_url = "photo_url"
41
+ else:
42
+ res = requests.get(match['metadata']['photo_image_url'])
43
+ if res.status_code == 200:
44
+ good_url = "photo_image_url"
45
+ else:
46
+ good_url = "not_found"
47
+ index.update(match['id'], set_metadata={
48
+ 'good_url': good_url,
49
+ 'link_check': True
50
+ })
51
+ ID_LIST.append(match['id'])
52
+ pbar.update(1)
53
+
54
+ logging.info("Refreshing 'link_check' field...")
55
+ for _id in tqdm(ID_LIST):
56
+ index.update(_id, set_metadata={
57
+ 'link_check': False
58
+ })
unsplash-25k-clip-indexer.ipynb ADDED
@@ -0,0 +1,775 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "!pip install transformers pinecone-client tqdm"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "The dataset used is the [Unsplash Lite dataset](https://github.com/unsplash/datasets)."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 2,
22
+ "metadata": {},
23
+ "outputs": [
24
+ {
25
+ "data": {
26
+ "text/html": [
27
+ "<div>\n",
28
+ "<style scoped>\n",
29
+ " .dataframe tbody tr th:only-of-type {\n",
30
+ " vertical-align: middle;\n",
31
+ " }\n",
32
+ "\n",
33
+ " .dataframe tbody tr th {\n",
34
+ " vertical-align: top;\n",
35
+ " }\n",
36
+ "\n",
37
+ " .dataframe thead th {\n",
38
+ " text-align: right;\n",
39
+ " }\n",
40
+ "</style>\n",
41
+ "<table border=\"1\" class=\"dataframe\">\n",
42
+ " <thead>\n",
43
+ " <tr style=\"text-align: right;\">\n",
44
+ " <th></th>\n",
45
+ " <th>photo_id</th>\n",
46
+ " <th>photo_url</th>\n",
47
+ " <th>photo_image_url</th>\n",
48
+ " <th>photo_submitted_at</th>\n",
49
+ " <th>photo_featured</th>\n",
50
+ " <th>photo_width</th>\n",
51
+ " <th>photo_height</th>\n",
52
+ " <th>photo_aspect_ratio</th>\n",
53
+ " <th>photo_description</th>\n",
54
+ " <th>photographer_username</th>\n",
55
+ " <th>...</th>\n",
56
+ " <th>photo_location_country</th>\n",
57
+ " <th>photo_location_city</th>\n",
58
+ " <th>stats_views</th>\n",
59
+ " <th>stats_downloads</th>\n",
60
+ " <th>ai_description</th>\n",
61
+ " <th>ai_primary_landmark_name</th>\n",
62
+ " <th>ai_primary_landmark_latitude</th>\n",
63
+ " <th>ai_primary_landmark_longitude</th>\n",
64
+ " <th>ai_primary_landmark_confidence</th>\n",
65
+ " <th>blur_hash</th>\n",
66
+ " </tr>\n",
67
+ " </thead>\n",
68
+ " <tbody>\n",
69
+ " <tr>\n",
70
+ " <th>0</th>\n",
71
+ " <td>XMyPniM9LF0</td>\n",
72
+ " <td>https://unsplash.com/photos/XMyPniM9LF0</td>\n",
73
+ " <td>https://images.unsplash.com/uploads/1411949294...</td>\n",
74
+ " <td>2014-09-29 00:08:38.594364</td>\n",
75
+ " <td>t</td>\n",
76
+ " <td>4272</td>\n",
77
+ " <td>2848</td>\n",
78
+ " <td>1.50</td>\n",
79
+ " <td>Woman exploring a forest</td>\n",
80
+ " <td>michellespencer77</td>\n",
81
+ " <td>...</td>\n",
82
+ " <td>NaN</td>\n",
83
+ " <td>NaN</td>\n",
84
+ " <td>2375421</td>\n",
85
+ " <td>6967</td>\n",
86
+ " <td>woman walking in the middle of forest</td>\n",
87
+ " <td>NaN</td>\n",
88
+ " <td>NaN</td>\n",
89
+ " <td>NaN</td>\n",
90
+ " <td>NaN</td>\n",
91
+ " <td>L56bVcRRIWMh.gVunlS4SMbsRRxr</td>\n",
92
+ " </tr>\n",
93
+ " <tr>\n",
94
+ " <th>1</th>\n",
95
+ " <td>rDLBArZUl1c</td>\n",
96
+ " <td>https://unsplash.com/photos/rDLBArZUl1c</td>\n",
97
+ " <td>https://images.unsplash.com/photo-141633941111...</td>\n",
98
+ " <td>2014-11-18 19:36:57.08945</td>\n",
99
+ " <td>t</td>\n",
100
+ " <td>3000</td>\n",
101
+ " <td>4000</td>\n",
102
+ " <td>0.75</td>\n",
103
+ " <td>Succulents in a terrarium</td>\n",
104
+ " <td>ugmonk</td>\n",
105
+ " <td>...</td>\n",
106
+ " <td>NaN</td>\n",
107
+ " <td>NaN</td>\n",
108
+ " <td>13784815</td>\n",
109
+ " <td>82141</td>\n",
110
+ " <td>succulent plants in clear glass terrarium</td>\n",
111
+ " <td>NaN</td>\n",
112
+ " <td>NaN</td>\n",
113
+ " <td>NaN</td>\n",
114
+ " <td>NaN</td>\n",
115
+ " <td>LvI$4txu%2s:_4t6WUj]xat7RPoe</td>\n",
116
+ " </tr>\n",
117
+ " <tr>\n",
118
+ " <th>2</th>\n",
119
+ " <td>cNDGZ2sQ3Bo</td>\n",
120
+ " <td>https://unsplash.com/photos/cNDGZ2sQ3Bo</td>\n",
121
+ " <td>https://images.unsplash.com/photo-142014251503...</td>\n",
122
+ " <td>2015-01-01 20:02:02.097036</td>\n",
123
+ " <td>t</td>\n",
124
+ " <td>2564</td>\n",
125
+ " <td>1710</td>\n",
126
+ " <td>1.50</td>\n",
127
+ " <td>Rural winter mountainside</td>\n",
128
+ " <td>johnprice</td>\n",
129
+ " <td>...</td>\n",
130
+ " <td>NaN</td>\n",
131
+ " <td>NaN</td>\n",
132
+ " <td>1302461</td>\n",
133
+ " <td>3428</td>\n",
134
+ " <td>rocky mountain under gray sky at daytime</td>\n",
135
+ " <td>NaN</td>\n",
136
+ " <td>NaN</td>\n",
137
+ " <td>NaN</td>\n",
138
+ " <td>NaN</td>\n",
139
+ " <td>LhMj%NxvM{t7_4t7aeoM%2M{ozj[</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>3</th>\n",
143
+ " <td>iuZ_D1eoq9k</td>\n",
144
+ " <td>https://unsplash.com/photos/iuZ_D1eoq9k</td>\n",
145
+ " <td>https://images.unsplash.com/photo-141487280988...</td>\n",
146
+ " <td>2014-11-01 20:15:13.410073</td>\n",
147
+ " <td>t</td>\n",
148
+ " <td>2912</td>\n",
149
+ " <td>4368</td>\n",
150
+ " <td>0.67</td>\n",
151
+ " <td>Poppy seeds and flowers</td>\n",
152
+ " <td>krisatomic</td>\n",
153
+ " <td>...</td>\n",
154
+ " <td>NaN</td>\n",
155
+ " <td>NaN</td>\n",
156
+ " <td>2890238</td>\n",
157
+ " <td>33704</td>\n",
158
+ " <td>red common poppy flower selective focus phography</td>\n",
159
+ " <td>NaN</td>\n",
160
+ " <td>NaN</td>\n",
161
+ " <td>NaN</td>\n",
162
+ " <td>NaN</td>\n",
163
+ " <td>LSC7DirZAsX7}Br@GEWWmnoLWCnj</td>\n",
164
+ " </tr>\n",
165
+ " <tr>\n",
166
+ " <th>4</th>\n",
167
+ " <td>BeD3vjQ8SI0</td>\n",
168
+ " <td>https://unsplash.com/photos/BeD3vjQ8SI0</td>\n",
169
+ " <td>https://images.unsplash.com/photo-141700759404...</td>\n",
170
+ " <td>2014-11-26 13:13:50.134383</td>\n",
171
+ " <td>t</td>\n",
172
+ " <td>4896</td>\n",
173
+ " <td>3264</td>\n",
174
+ " <td>1.50</td>\n",
175
+ " <td>Silhouette near dark trees</td>\n",
176
+ " <td>jonaseriksson</td>\n",
177
+ " <td>...</td>\n",
178
+ " <td>NaN</td>\n",
179
+ " <td>NaN</td>\n",
180
+ " <td>8704860</td>\n",
181
+ " <td>49662</td>\n",
182
+ " <td>trees during night time</td>\n",
183
+ " <td>NaN</td>\n",
184
+ " <td>NaN</td>\n",
185
+ " <td>NaN</td>\n",
186
+ " <td>NaN</td>\n",
187
+ " <td>L25|_:V@0hxtI=W;odae0ht6=^NG</td>\n",
188
+ " </tr>\n",
189
+ " </tbody>\n",
190
+ "</table>\n",
191
+ "<p>5 rows × 31 columns</p>\n",
192
+ "</div>"
193
+ ],
194
+ "text/plain": [
195
+ " photo_id photo_url \\\n",
196
+ "0 XMyPniM9LF0 https://unsplash.com/photos/XMyPniM9LF0 \n",
197
+ "1 rDLBArZUl1c https://unsplash.com/photos/rDLBArZUl1c \n",
198
+ "2 cNDGZ2sQ3Bo https://unsplash.com/photos/cNDGZ2sQ3Bo \n",
199
+ "3 iuZ_D1eoq9k https://unsplash.com/photos/iuZ_D1eoq9k \n",
200
+ "4 BeD3vjQ8SI0 https://unsplash.com/photos/BeD3vjQ8SI0 \n",
201
+ "\n",
202
+ " photo_image_url \\\n",
203
+ "0 https://images.unsplash.com/uploads/1411949294... \n",
204
+ "1 https://images.unsplash.com/photo-141633941111... \n",
205
+ "2 https://images.unsplash.com/photo-142014251503... \n",
206
+ "3 https://images.unsplash.com/photo-141487280988... \n",
207
+ "4 https://images.unsplash.com/photo-141700759404... \n",
208
+ "\n",
209
+ " photo_submitted_at photo_featured photo_width photo_height \\\n",
210
+ "0 2014-09-29 00:08:38.594364 t 4272 2848 \n",
211
+ "1 2014-11-18 19:36:57.08945 t 3000 4000 \n",
212
+ "2 2015-01-01 20:02:02.097036 t 2564 1710 \n",
213
+ "3 2014-11-01 20:15:13.410073 t 2912 4368 \n",
214
+ "4 2014-11-26 13:13:50.134383 t 4896 3264 \n",
215
+ "\n",
216
+ " photo_aspect_ratio photo_description photographer_username ... \\\n",
217
+ "0 1.50 Woman exploring a forest michellespencer77 ... \n",
218
+ "1 0.75 Succulents in a terrarium ugmonk ... \n",
219
+ "2 1.50 Rural winter mountainside johnprice ... \n",
220
+ "3 0.67 Poppy seeds and flowers krisatomic ... \n",
221
+ "4 1.50 Silhouette near dark trees jonaseriksson ... \n",
222
+ "\n",
223
+ " photo_location_country photo_location_city stats_views stats_downloads \\\n",
224
+ "0 NaN NaN 2375421 6967 \n",
225
+ "1 NaN NaN 13784815 82141 \n",
226
+ "2 NaN NaN 1302461 3428 \n",
227
+ "3 NaN NaN 2890238 33704 \n",
228
+ "4 NaN NaN 8704860 49662 \n",
229
+ "\n",
230
+ " ai_description ai_primary_landmark_name \\\n",
231
+ "0 woman walking in the middle of forest NaN \n",
232
+ "1 succulent plants in clear glass terrarium NaN \n",
233
+ "2 rocky mountain under gray sky at daytime NaN \n",
234
+ "3 red common poppy flower selective focus phography NaN \n",
235
+ "4 trees during night time NaN \n",
236
+ "\n",
237
+ " ai_primary_landmark_latitude ai_primary_landmark_longitude \\\n",
238
+ "0 NaN NaN \n",
239
+ "1 NaN NaN \n",
240
+ "2 NaN NaN \n",
241
+ "3 NaN NaN \n",
242
+ "4 NaN NaN \n",
243
+ "\n",
244
+ " ai_primary_landmark_confidence blur_hash \n",
245
+ "0 NaN L56bVcRRIWMh.gVunlS4SMbsRRxr \n",
246
+ "1 NaN LvI$4txu%2s:_4t6WUj]xat7RPoe \n",
247
+ "2 NaN LhMj%NxvM{t7_4t7aeoM%2M{ozj[ \n",
248
+ "3 NaN LSC7DirZAsX7}Br@GEWWmnoLWCnj \n",
249
+ "4 NaN L25|_:V@0hxtI=W;odae0ht6=^NG \n",
250
+ "\n",
251
+ "[5 rows x 31 columns]"
252
+ ]
253
+ },
254
+ "execution_count": 2,
255
+ "metadata": {},
256
+ "output_type": "execute_result"
257
+ }
258
+ ],
259
+ "source": [
260
+ "import pandas as pd\n",
261
+ "\n",
262
+ "images = pd.read_csv('photos.tsv000', delimiter='\\t')\n",
263
+ "images.head()"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "markdown",
268
+ "metadata": {},
269
+ "source": [
270
+ "We download using the `photo_image_url` field."
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 4,
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": [
279
+ "from PIL import Image\n",
280
+ "import requests\n",
281
+ "from io import BytesIO\n",
282
+ "\n",
283
+ "url = images['photo_image_url'].iloc[0]\n",
284
+ "\n",
285
+ "response = requests.get(url)\n",
286
+ "img = Image.open(BytesIO(response.content))\n",
287
+ "img"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "markdown",
292
+ "metadata": {},
293
+ "source": [
294
+ "We need to use these images to create vector embeddings, to do this we will use OpenAI's CLIP from the `transformers` library.\n",
295
+ "\n",
296
+ "```\n",
297
+ "!pip install transformers\n",
298
+ "```"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 5,
304
+ "metadata": {},
305
+ "outputs": [
306
+ {
307
+ "name": "stderr",
308
+ "output_type": "stream",
309
+ "text": [
310
+ "2022-08-12 14:07:47.935784: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
311
+ "ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.\n"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "from transformers import CLIPProcessor, CLIPModel\n",
317
+ "import torch\n",
318
+ "\n",
319
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
320
+ "model_name = \"openai/clip-vit-base-patch32\"\n",
321
+ "\n",
322
+ "model = CLIPModel.from_pretrained(model_name).to(device)\n",
323
+ "processor = CLIPProcessor.from_pretrained(model_name)"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "metadata": {},
329
+ "source": [
330
+ "Now we're ready to use the vision transformer (ViT) portion of CLIP to create feature vectors (embedding representations) from the image."
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": 6,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "img = processor(\n",
340
+ " text=None,\n",
341
+ " images=img,\n",
342
+ " return_tensors='pt',\n",
343
+ " padding=True\n",
344
+ ")['pixel_values'].to(device)"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 7,
350
+ "metadata": {},
351
+ "outputs": [
352
+ {
353
+ "data": {
354
+ "text/plain": [
355
+ "torch.Size([1, 512])"
356
+ ]
357
+ },
358
+ "execution_count": 7,
359
+ "metadata": {},
360
+ "output_type": "execute_result"
361
+ }
362
+ ],
363
+ "source": [
364
+ "out = model.get_image_features(pixel_values=img)\n",
365
+ "out.shape"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 8,
371
+ "metadata": {},
372
+ "outputs": [
373
+ {
374
+ "data": {
375
+ "text/plain": [
376
+ "torch.Size([512])"
377
+ ]
378
+ },
379
+ "execution_count": 8,
380
+ "metadata": {},
381
+ "output_type": "execute_result"
382
+ }
383
+ ],
384
+ "source": [
385
+ "out = out.squeeze(0)\n",
386
+ "out.shape"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 9,
392
+ "metadata": {},
393
+ "outputs": [
394
+ {
395
+ "data": {
396
+ "text/plain": [
397
+ "(512,)"
398
+ ]
399
+ },
400
+ "execution_count": 9,
401
+ "metadata": {},
402
+ "output_type": "execute_result"
403
+ }
404
+ ],
405
+ "source": [
406
+ "emb = out.cpu().detach().numpy()\n",
407
+ "emb.shape"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": 10,
413
+ "metadata": {},
414
+ "outputs": [
415
+ {
416
+ "data": {
417
+ "text/plain": [
418
+ "(-7.985501, 2.0108054)"
419
+ ]
420
+ },
421
+ "execution_count": 10,
422
+ "metadata": {},
423
+ "output_type": "execute_result"
424
+ }
425
+ ],
426
+ "source": [
427
+ "emb.min(), emb.max()"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "markdown",
432
+ "metadata": {},
433
+ "source": [
434
+ "Now we have a single `512` dimensional vector that represents the *meaning* behind the image. As we will be using dot product similarity we should also normalize these vectors."
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 10,
440
+ "metadata": {},
441
+ "outputs": [],
442
+ "source": [
443
+ "import numpy as np\n",
444
+ "\n",
445
+ "emb = emb / np.linalg.norm(emb)"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": 11,
451
+ "metadata": {},
452
+ "outputs": [
453
+ {
454
+ "data": {
455
+ "text/plain": [
456
+ "(-0.56626415, 0.13343191)"
457
+ ]
458
+ },
459
+ "execution_count": 11,
460
+ "metadata": {},
461
+ "output_type": "execute_result"
462
+ }
463
+ ],
464
+ "source": [
465
+ "emb.min(), emb.max()"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "markdown",
470
+ "metadata": {},
471
+ "source": [
472
+ "## Indexing\n",
473
+ "\n",
474
+ "To index this image in Pinecone we first install the Pinecone client:\n",
475
+ "\n",
476
+ "```\n",
477
+ "!pip install pinecone-client\n",
478
+ "```\n",
479
+ "\n",
480
+ "And then initialize our connection to Pinecone, this requires a [free API key](https://app.pinecone.io/)."
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 11,
486
+ "metadata": {},
487
+ "outputs": [],
488
+ "source": [
489
+ "import pinecone\n",
490
+ "\n",
491
+ "index_name = \"unsplash-25k-clip\"\n",
492
+ "\n",
493
+ "pinecone.init(\n",
494
+ " api_key=\"<<API_KEY_HERE>>\",\n",
495
+ " environment=\"us-west1-gcp\"\n",
496
+ ")\n",
497
+ "\n",
498
+ "if index_name not in pinecone.list_indexes():\n",
499
+ " pinecone.create_index(\n",
500
+ " index_name,\n",
501
+ " emb.shape[0],\n",
502
+ " metric=\"dotproduct\"\n",
503
+ " )\n",
504
+ "# connect to the index\n",
505
+ "index = pinecone.Index(index_name)"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "markdown",
510
+ "metadata": {},
511
+ "source": [
512
+ "To upsert the single feature embedding we have created, we use `upsert`. There are also some possibly relevant metadata info we might want to add."
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": 13,
518
+ "metadata": {},
519
+ "outputs": [
520
+ {
521
+ "data": {
522
+ "text/plain": [
523
+ "{'photo_url': 'https://unsplash.com/photos/XMyPniM9LF0',\n",
524
+ " 'photo_image_url': 'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3',\n",
525
+ " 'photo_submitted_at': '2014-09-29 00:08:38.594364',\n",
526
+ " 'photo_description': 'Woman exploring a forest',\n",
527
+ " 'photographer_username': 'michellespencer77',\n",
528
+ " 'ai_description': 'woman walking in the middle of forest'}"
529
+ ]
530
+ },
531
+ "execution_count": 13,
532
+ "metadata": {},
533
+ "output_type": "execute_result"
534
+ }
535
+ ],
536
+ "source": [
537
+ "row = images.iloc[0]\n",
538
+ "\n",
539
+ "_id = row['photo_id']\n",
540
+ "meta = {\n",
541
+ " \"photo_url\": row[\"photo_url\"],\n",
542
+ " \"photo_image_url\": row[\"photo_image_url\"],\n",
543
+ " \"photo_submitted_at\": row[\"photo_submitted_at\"],\n",
544
+ " \"photo_description\": row[\"photo_description\"],\n",
545
+ " \"photographer_username\": row[\"photographer_username\"],\n",
546
+ " \"ai_description\": row[\"ai_description\"]\n",
547
+ "}\n",
548
+ "\n",
549
+ "meta"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": 14,
555
+ "metadata": {},
556
+ "outputs": [
557
+ {
558
+ "data": {
559
+ "text/plain": [
560
+ "{'upserted_count': 1}"
561
+ ]
562
+ },
563
+ "execution_count": 14,
564
+ "metadata": {},
565
+ "output_type": "execute_result"
566
+ }
567
+ ],
568
+ "source": [
569
+ "to_upsert = [(_id, emb.tolist(), meta)]\n",
570
+ "\n",
571
+ "index.upsert(to_upsert)"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": 15,
577
+ "metadata": {},
578
+ "outputs": [
579
+ {
580
+ "data": {
581
+ "text/plain": [
582
+ "'XMyPniM9LF0'"
583
+ ]
584
+ },
585
+ "execution_count": 15,
586
+ "metadata": {},
587
+ "output_type": "execute_result"
588
+ }
589
+ ],
590
+ "source": [
591
+ "_id"
592
+ ]
593
+ },
594
+ {
595
+ "cell_type": "markdown",
596
+ "metadata": {
597
+ "tags": []
598
+ },
599
+ "source": [
600
+ "Note that we added a string ID value `\"XMyPniM9LF0\"` and also converted the feature embedding tensor to a flat list before adding to our Pinecone index.\n",
601
+ "\n",
602
+ "## Indexing Everything\n",
603
+ "\n",
604
+ "So far we've built one feature embedding and indexed it in Pinecone, now let's repeat the process for a lot of images.\n",
605
+ "\n",
606
+ "We will do this in batches, taking `32` images at a time, embedding them with Resnet-50, and indexing them in Pinecone."
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 23,
612
+ "metadata": {},
613
+ "outputs": [],
614
+ "source": [
615
+ "import numpy as np"
616
+ ]
617
+ },
618
+ {
619
+ "cell_type": "code",
620
+ "execution_count": 79,
621
+ "metadata": {},
622
+ "outputs": [
623
+ {
624
+ "data": {
625
+ "application/vnd.jupyter.widget-view+json": {
626
+ "model_id": "6726c0eb47de4cd780f3e1096a2d743f",
627
+ "version_major": 2,
628
+ "version_minor": 0
629
+ },
630
+ "text/plain": [
631
+ " 0%| | 0/1370 [00:00<?, ?it/s]"
632
+ ]
633
+ },
634
+ "metadata": {},
635
+ "output_type": "display_data"
636
+ },
637
+ {
638
+ "name": "stderr",
639
+ "output_type": "stream",
640
+ "text": [
641
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (99996755 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
642
+ " DecompressionBombWarning,\n",
643
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (96768910 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
644
+ " DecompressionBombWarning,\n",
645
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (99991727 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
646
+ " DecompressionBombWarning,\n",
647
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (143040000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
648
+ " DecompressionBombWarning,\n",
649
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (94212096 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
650
+ " DecompressionBombWarning,\n",
651
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (121500000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
652
+ " DecompressionBombWarning,\n",
653
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (107424768 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
654
+ " DecompressionBombWarning,\n",
655
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (147015000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
656
+ " DecompressionBombWarning,\n",
657
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (107184040 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
658
+ " DecompressionBombWarning,\n",
659
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (146784000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
660
+ " DecompressionBombWarning,\n",
661
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (90671520 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
662
+ " DecompressionBombWarning,\n",
663
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (99992815 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
664
+ " DecompressionBombWarning,\n",
665
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (95808000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
666
+ " DecompressionBombWarning,\n",
667
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (121554000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
668
+ " DecompressionBombWarning,\n",
669
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (91177320 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
670
+ " DecompressionBombWarning,\n",
671
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (99996120 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
672
+ " DecompressionBombWarning,\n",
673
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (96000000 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
674
+ " DecompressionBombWarning,\n",
675
+ "/opt/conda/lib/python3.7/site-packages/PIL/Image.py:2899: DecompressionBombWarning: Image size (98058240 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.\n",
676
+ " DecompressionBombWarning,\n"
677
+ ]
678
+ }
679
+ ],
680
+ "source": [
681
+ "from tqdm.auto import tqdm\n",
682
+ "batch_size = 16\n",
683
+ "images_len = len(images)\n",
684
+ "\n",
685
+ "exceptions = []\n",
686
+ "\n",
687
+ "for i in tqdm(range(3088, images_len, batch_size)):\n",
688
+ " # select the batch start and end\n",
689
+ " i_end = min(i + batch_size, images_len)\n",
690
+ " # get batch\n",
691
+ " batch = images.iloc[i:i_end]\n",
692
+ " # retrieve URLs\n",
693
+ " url_batch = batch['photo_image_url']\n",
694
+ " # get images\n",
695
+ " image_batch = []\n",
696
+ " for url in url_batch:\n",
697
+ " try:\n",
698
+ " response = requests.get(url)\n",
699
+ " img = Image.open(BytesIO(response.content))\n",
700
+ " if img.mode in ['L', 'CMYK', 'RGBA']:\n",
701
+ " # L is grayscale, CMYK uses alternative color channels\n",
702
+ " img = img.convert('RGB')\n",
703
+ " image_batch.append(img)\n",
704
+ " except Exception as e:\n",
705
+ " exceptions.append((\"url\", e))\n",
706
+ " # process images and extract pytorch tensor pixel values\n",
707
+ " try:\n",
708
+ " image_batch = processor(\n",
709
+ " text=None,\n",
710
+ " images=image_batch,\n",
711
+ " return_tensors='pt'\n",
712
+ " )['pixel_values'].to(device)\n",
713
+ " # feed tensors to model and extract last state\n",
714
+ " out = model.get_image_features(pixel_values=image_batch)\n",
715
+ " out = out.squeeze(0)\n",
716
+ " # take the mean across each dimension to create a single vector embedding\n",
717
+ " embeds = out.cpu().detach().numpy()\n",
718
+ " # normalize and convert to list\n",
719
+ " embeds = embeds / np.linalg.norm(embeds, axis=0)\n",
720
+ " embeds = embeds.tolist()\n",
721
+ " # get ID values\n",
722
+ " ids = batch['photo_id']\n",
723
+ " # prep metadata\n",
724
+ " metadata = batch[[\n",
725
+ " \"photo_url\", \"photo_image_url\", \"photo_submitted_at\",\n",
726
+ " \"photo_description\", \"photographer_username\", \"ai_description\"\n",
727
+ " ]].fillna(\"\").to_dict(orient=\"records\")\n",
728
+ " # zip all data together and upsert\n",
729
+ " to_upsert = zip(ids, embeds, metadata)\n",
730
+ " index.upsert(to_upsert)\n",
731
+ " except Exception as e:\n",
732
+ " exceptions.append((\"process\", e))"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "markdown",
737
+ "metadata": {},
738
+ "source": [
739
+ "---"
740
+ ]
741
+ }
742
+ ],
743
+ "metadata": {
744
+ "environment": {
745
+ "kernel": "python3",
746
+ "name": "common-cu110.m91",
747
+ "type": "gcloud",
748
+ "uri": "gcr.io/deeplearning-platform-release/base-cu110:m91"
749
+ },
750
+ "kernelspec": {
751
+ "display_name": "Python 3",
752
+ "language": "python",
753
+ "name": "python3"
754
+ },
755
+ "language_info": {
756
+ "codemirror_mode": {
757
+ "name": "ipython",
758
+ "version": 3
759
+ },
760
+ "file_extension": ".py",
761
+ "mimetype": "text/x-python",
762
+ "name": "python",
763
+ "nbconvert_exporter": "python",
764
+ "pygments_lexer": "ipython3",
765
+ "version": "3.8.13"
766
+ },
767
+ "vscode": {
768
+ "interpreter": {
769
+ "hash": "9ec8fc8fb845fc3e050bf8bf651a355c069bbfeddee31167baf4bc42b6050476"
770
+ }
771
+ }
772
+ },
773
+ "nbformat": 4,
774
+ "nbformat_minor": 4
775
+ }
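
The notebook stores unit-normalized CLIP image embeddings behind a dot-product metric, while the app queries that index with a CLIP *text* embedding built from the user's prompt. The snippet below is a rough sketch of that query path using the same model ID and index name as above; the example prompt, the normalization placement, and the printed fields are illustrative assumptions rather than code taken from app.py.

```python
import numpy as np
import pinecone
import torch
from transformers import CLIPModel, CLIPTokenizerFast

MODEL_ID = "openai/clip-vit-base-patch32"
INDEX = "unsplash-25k-clip"

tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)

pinecone.init(api_key="<<API_KEY_HERE>>", environment="us-west1-gcp")
index = pinecone.Index(INDEX)

# encode a text prompt into the same 512-d space as the indexed images
inputs = tokenizer("dogs in the snow", return_tensors="pt")
with torch.no_grad():
    xq = model.get_text_features(**inputs)[0].numpy()
xq = (xq / np.linalg.norm(xq)).tolist()  # normalize for the dot-product metric

# retrieve the nearest image embeddings and print their Unsplash page URLs
xc = index.query(xq, top_k=9, include_metadata=True)
for match in xc["matches"]:
    print(match["id"], match["metadata"]["photo_url"])
```
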