Spaces:

jasonwu92
/

image-search-playground

Runtime error

App Files Files Community

jasonwuyl92 committed on May 23, 2023

Commit

2ab45c8

0 Parent(s):

initial commit after cleanup

Browse files

Files changed (12) hide show

.gitattributes +36 -0
.gitignore +6 -0
README.md +14 -0
app.py +69 -0
app_old.py +38 -0
get_embeddings.ipynb +1047 -0
misc.py +24 -0
requirements.txt +13 -0
run.py +51 -0
streamlit_app.py +39 -0
utils.py +170 -0
vector_db.py +37 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pq filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+.DS_Store
+.idea/
+.python-version
+.ipynb_checkpoints/
+__pycache__
+flagged

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Image Search Playground
+emoji: 📈
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 3.30.0
+app_file: app.py
+pinned: false
+license: mit
+python_version: 3.10.0
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import os
+from functools import partial
+import gradio as gr
+import pandas as pd
+import utils
+import vector_db
+from utils import get_image_embedding, \
+    get_image_path, model_names, download_images, generate_and_save_embeddings, get_metadata_path, url_to_image
+NUM_OUTPUTS = 4
+def search(input_img, model_name):
+    query_embedding = get_image_embedding(model_name, input_img).tolist()
+    top_results = vector_db.query_embeddings_db(query_embedding=query_embedding,
+                                                dataset_name=utils.cur_dataset, model_name=model_name)
+    print (top_results)
+    return [utils.url_to_image(hit['metadata']['mainphotourl']) for hit in top_results['matches']]
+def read_tsv_temporary_file(temp_file_wrapper):
+    dataset_name = os.path.splitext(os.path.basename(temp_file_wrapper.name))[0]
+    utils.set_cur_dataset(dataset_name)
+    df = pd.read_csv(temp_file_wrapper.name, sep='\t')  # Read the TSV content into a pandas DataFrame
+    df.to_csv(os.path.join(get_metadata_path(), dataset_name + '.tsv'), sep='\t', index=False)
+    print('start downloading')
+    download_images(df, get_image_path())
+    generate_and_save_embeddings()
+    utils.refresh_all_datasets()
+    utils.set_cur_dataset(dataset_name)
+    return gr.update(choices=utils.all_datasets, value=dataset_name)
+def update_dataset_dropdown():
+    utils.refresh_all_datasets()
+    utils.set_cur_dataset(utils.all_datasets[0])
+    return gr.update(choices=utils.all_datasets, value=utils.cur_dataset)
+def gen_image_blocks(num_outputs):
+    with gr.Row():
+        row = [gr.outputs.Image(label=model_name, type='filepath') for i in range(int(num_outputs))]
+    return row
+with gr.Blocks() as demo:
+    galleries = dict()
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_upload = gr.File(label="Upload TSV File", file_types=[".tsv"])
+            image_input = gr.inputs.Image(type="pil", label="Input Image")
+            dataset_dropdown = gr.Dropdown(label='Datasets', choices=utils.all_datasets)
+            b1 = gr.Button("Find Similar Images")
+            b2 = gr.Button("Refresh Datasets")
+            dataset_dropdown.select(utils.set_cur_dataset, inputs=dataset_dropdown)
+            file_upload.upload(read_tsv_temporary_file, inputs=file_upload, outputs=dataset_dropdown)
+            b2.click(update_dataset_dropdown, outputs=dataset_dropdown)
+        with gr.Column(scale=3):
+            for model_name in model_names:
+                galleries[model_name] = gen_image_blocks(NUM_OUTPUTS)
+    for model_name in model_names:
+        b1.click(partial(search, model_name=model_name), inputs=[image_input],
+                 outputs=galleries[model_name])
+    b2.click(utils.refresh_all_datasets, outputs=dataset_dropdown)
+demo.launch()

app_old.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import numpy as np
+import gradio as gr
+from sentence_transformers import util as st_util
+import pandas as pd
+import os
+from utils import load_models, get_image_embedding, img_folder, model_name_to_ids, data_path, model_names
+def search(input_img, num_outputs):
+    results = []
+    for model_name in model_names:
+        query_embedding = get_image_embedding(model_name, input_img)
+        top_results = st_util.semantic_search(query_embedding,
+                                           np.vstack(list(corpus_embeddings[model_name + '-embedding'])),
+                                              top_k=int(num_outputs))[0]
+        results.append([os.path.join(img_folder,
+                          corpus_embeddings.iloc[hit['corpus_id']]['name']) for hit in top_results])
+    return results
+load_models()
+corpus_embeddings = pd.read_parquet(
+    os.path.join(data_path, 'metadata/patagonia_losGatos_embeddings.pq'))
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=search,
+    inputs=[gr.Image(type="pil"),
+            gr.inputs.Number(label="Number of results", default=3)],
+    outputs=[gr.Gallery(label=model_name, type='filepath') for model_name in model_names],
+    title="Search Similar Images",
+    description="Upload an image and find similar images",
+)
+# Launch the Gradio interface
+iface.launch(debug=True)

get_embeddings.ipynb ADDED Viewed

	@@ -0,0 +1,1047 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name 'data_path' from 'utils' (/Users/yonglinwu/dev/image-search-playground/utils.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[28], line 9\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtorch\u001b[39;00m\n\u001b[1;32m      7\u001b[0m torch\u001b[39m.\u001b[39mset_printoptions(precision\u001b[39m=\u001b[39m\u001b[39m10\u001b[39m)\n\u001b[0;32m----> 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m get_image_embeddings, model_name_to_ids, load_models, model_dict, data_path\n\u001b[1;32m     11\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mwarnings\u001b[39;00m\n\u001b[1;32m     12\u001b[0m warnings\u001b[39m.\u001b[39msimplefilter(action\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mignore\u001b[39m\u001b[39m'\u001b[39m, category\u001b[39m=\u001b[39m\u001b[39mFutureWarning\u001b[39;00m)\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name 'data_path' from 'utils' (/Users/yonglinwu/dev/image-search-playground/utils.py)"
+     ]
+    }
+   ],
+   "source": [
+    "from sentence_transformers import SentenceTransformer, util\n",
+    "from PIL import Image\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "torch.set_printoptions(precision=10)\n",
+    "\n",
+    "from utils import get_image_embeddings, model_name_to_ids, load_models, model_dict, data_path\n",
+    "\n",
+    "import warnings\n",
+    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "patagonia_df = pd.read_csv(data_path + 'metadata/patagonia_losGatos.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>brand</th>\n",
+       "      <th>title</th>\n",
+       "      <th>product_url</th>\n",
+       "      <th>price</th>\n",
+       "      <th>description</th>\n",
+       "      <th>size</th>\n",
+       "      <th>category</th>\n",
+       "      <th>colors</th>\n",
+       "      <th>Poshmark</th>\n",
+       "      <th>Unnamed: 9</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Unnamed: 38</th>\n",
+       "      <th>Unnamed: 39</th>\n",
+       "      <th>Unnamed: 40</th>\n",
+       "      <th>Unnamed: 41</th>\n",
+       "      <th>Unnamed: 42</th>\n",
+       "      <th>Unnamed: 43</th>\n",
+       "      <th>Unnamed: 44</th>\n",
+       "      <th>Unnamed: 45</th>\n",
+       "      <th>Unnamed: 46</th>\n",
+       "      <th>Unnamed: 47</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Patagonia</td>\n",
+       "      <td>Patagonia Women's Los Gatos Fleece 1/4-Zip Smo...</td>\n",
+       "      <td>https://poshmark.com/listing/63d4821f2fbf1afe8...</td>\n",
+       "      <td>$36.00</td>\n",
+       "      <td>A soft, warm and versatile quarter-zip pullove...</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Tops</td>\n",
+       "      <td>[{'name': 'Gray', 'rgb': '#929292', 'message_i...</td>\n",
+       "      <td>Poshmark</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Patagonia</td>\n",
+       "      <td>Patagonia Los Gatos 1/4 Zip Pullover M Beech B...</td>\n",
+       "      <td>https://poshmark.com/listing/63fcd7709f212bd48...</td>\n",
+       "      <td>$59.00</td>\n",
+       "      <td>High pile, quarter zip pulllover\\nMeasurements...</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Tops</td>\n",
+       "      <td>[{'name': 'Brown', 'rgb': '#663509', 'message_...</td>\n",
+       "      <td>Poshmark</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Patagonia</td>\n",
+       "      <td>PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...</td>\n",
+       "      <td>https://poshmark.com/listing/642b9bbcfed51f812...</td>\n",
+       "      <td>$59.00</td>\n",
+       "      <td>PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...</td>\n",
+       "      <td>S</td>\n",
+       "      <td>Tops</td>\n",
+       "      <td>[{'name': 'White', 'rgb': '#FFFFFF', 'message_...</td>\n",
+       "      <td>Poshmark</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Patagonia</td>\n",
+       "      <td>Girl’s Patagonia Los Gatos Fleece 1/4 Zip XS</td>\n",
+       "      <td>https://poshmark.com/listing/63f4f459c5df6c7f8...</td>\n",
+       "      <td>$30.00</td>\n",
+       "      <td>Girl’s Patagonia Los Gatos 1/4 Zip Fleece\\n\\n-...</td>\n",
+       "      <td>XSG</td>\n",
+       "      <td>Other</td>\n",
+       "      <td>[{'name': 'Tan', 'rgb': '#d1b48e', 'message_id...</td>\n",
+       "      <td>Poshmark</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Patagonia</td>\n",
+       "      <td>Patagonia Los Gatos Quarter Zip Grey</td>\n",
+       "      <td>https://poshmark.com/listing/622cc43d3a0db900b...</td>\n",
+       "      <td>$59.00</td>\n",
+       "      <td>Patagonia Los Gatos Quarter Zip Grey \\nWomen’s...</td>\n",
+       "      <td>M</td>\n",
+       "      <td>Tops</td>\n",
+       "      <td>[{'name': 'Gray', 'rgb': '#929292', 'message_i...</td>\n",
+       "      <td>Poshmark</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 48 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       brand                                              title  \\\n",
+       "0  Patagonia  Patagonia Women's Los Gatos Fleece 1/4-Zip Smo...   \n",
+       "1  Patagonia  Patagonia Los Gatos 1/4 Zip Pullover M Beech B...   \n",
+       "2  Patagonia  PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...   \n",
+       "3  Patagonia       Girl’s Patagonia Los Gatos Fleece 1/4 Zip XS   \n",
+       "4  Patagonia               Patagonia Los Gatos Quarter Zip Grey   \n",
+       "\n",
+       "                                         product_url   price  \\\n",
+       "0  https://poshmark.com/listing/63d4821f2fbf1afe8...  $36.00   \n",
+       "1  https://poshmark.com/listing/63fcd7709f212bd48...  $59.00   \n",
+       "2  https://poshmark.com/listing/642b9bbcfed51f812...  $59.00   \n",
+       "3  https://poshmark.com/listing/63f4f459c5df6c7f8...  $30.00   \n",
+       "4  https://poshmark.com/listing/622cc43d3a0db900b...  $59.00   \n",
+       "\n",
+       "                                         description size category  \\\n",
+       "0  A soft, warm and versatile quarter-zip pullove...    M     Tops   \n",
+       "1  High pile, quarter zip pulllover\\nMeasurements...    M     Tops   \n",
+       "2  PATAGONIA Women's Los Gatos Fleece 1/4-Zip Pul...    S     Tops   \n",
+       "3  Girl’s Patagonia Los Gatos 1/4 Zip Fleece\\n\\n-...  XSG    Other   \n",
+       "4  Patagonia Los Gatos Quarter Zip Grey \\nWomen’s...    M     Tops   \n",
+       "\n",
+       "                                              colors  Poshmark  Unnamed: 9  \\\n",
+       "0  [{'name': 'Gray', 'rgb': '#929292', 'message_i...  Poshmark       False   \n",
+       "1  [{'name': 'Brown', 'rgb': '#663509', 'message_...  Poshmark       False   \n",
+       "2  [{'name': 'White', 'rgb': '#FFFFFF', 'message_...  Poshmark       False   \n",
+       "3  [{'name': 'Tan', 'rgb': '#d1b48e', 'message_id...  Poshmark       False   \n",
+       "4  [{'name': 'Gray', 'rgb': '#929292', 'message_i...  Poshmark       False   \n",
+       "\n",
+       "   ...  Unnamed: 38 Unnamed: 39  Unnamed: 40  Unnamed: 41  Unnamed: 42  \\\n",
+       "0  ...          NaN         NaN          NaN          NaN          NaN   \n",
+       "1  ...          NaN         NaN          NaN          NaN          NaN   \n",
+       "2  ...          NaN         NaN          NaN          NaN          NaN   \n",
+       "3  ...          NaN         NaN          NaN          NaN          NaN   \n",
+       "4  ...          NaN         NaN          NaN          NaN          NaN   \n",
+       "\n",
+       "   Unnamed: 43  Unnamed: 44  Unnamed: 45  Unnamed: 46  Unnamed: 47  \n",
+       "0          NaN          NaN          NaN          NaN          NaN  \n",
+       "1          NaN          NaN          NaN          NaN          NaN  \n",
+       "2          NaN          NaN          NaN          NaN          NaN  \n",
+       "3          NaN          NaN          NaN          NaN          NaN  \n",
+       "4          NaN          NaN          NaN          NaN          NaN  \n",
+       "\n",
+       "[5 rows x 48 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "patagonia_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#download_images(patagonia_df, data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "load_models()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def generate_embeddings():\n",
+    "    embeddings_df = pd.DataFrame()\n",
+    "\n",
+    "    # Get image embeddings\n",
+    "    with torch.no_grad():\n",
+    "        for fp in os.listdir(data_path + 'images/'):\n",
+    "            if fp.endswith('.jpg'):\n",
+    "                new_row = {'name': fp}\n",
+    "                for model_name in model_name_to_ids.keys():\n",
+    "                    new_row[f'{model_name}-embedding'] = get_image_embeddings(model_name, Image.open(data_path + 'images/' + fp))\n",
+    "                embeddings_df = embeddings_df.append(new_row, ignore_index=True)\n",
+    "    return embeddings_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "fp = os.listdir(data_path + 'images/')[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model_name = 'fashion'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "new_row = {'name': fp, f'{model_name}-embedding': get_image_embeddings(model_name, Image.open(data_path + 'images/' + fp))}\n",
+    "                "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "embeddings_df = generate_embeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>sentence-transformer-clip-ViT-L-14-embedding</th>\n",
+       "      <th>fashion-embedding</th>\n",
+       "      <th>openai-clip-embedding</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Women's Under Armour Hustle Fleece Hoodie pull...</td>\n",
+       "      <td>[1.0734258, 0.99022365, 0.32032806, 0.2895219,...</td>\n",
+       "      <td>[0.23177437, -1.9268938, 0.273342, -0.02474568...</td>\n",
+       "      <td>[-0.32902592, -0.09434131, 0.3055967, 0.229937...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Patagonia Los Gatos Fleece Grey Pullover.jpg</td>\n",
+       "      <td>[0.6227796, 0.026531212, 0.45240527, -0.488214...</td>\n",
+       "      <td>[0.38133767, -1.3040155, 1.1697398, -0.3085520...</td>\n",
+       "      <td>[-0.1695469, 0.5067289, 0.31120676, -0.0083701...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>REI Women's Down With It Quilted Hooded Parka ...</td>\n",
+       "      <td>[0.8497103, 1.2925782, -0.21685322, 0.24116844...</td>\n",
+       "      <td>[-0.30043703, -1.3144073, -0.33848628, 0.24008...</td>\n",
+       "      <td>[-0.24841668, 0.4876942, 0.39810008, -0.141552...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Chanel Haute Couture Navy Blue Dress Semi Shee...</td>\n",
+       "      <td>[0.536018, 0.60787296, -0.2751825, 1.0325747, ...</td>\n",
+       "      <td>[-0.101031125, 0.033914, -0.44531134, -0.64656...</td>\n",
+       "      <td>[-0.08328074, 0.19443086, 0.14361368, 0.259305...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Patagonia Women’s S Los Gatos Quarter-Zip Flee...</td>\n",
+       "      <td>[0.79398394, 1.3899276, -0.21383175, 0.0109823...</td>\n",
+       "      <td>[0.60070944, -1.1051046, 1.0572466, 0.47092092...</td>\n",
+       "      <td>[-0.27894062, -0.09589732, 0.5556799, -0.13458...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>326</th>\n",
+       "      <td>Women's REI Elements Jacket Size M.jpg</td>\n",
+       "      <td>[0.6310029, 0.9942212, 0.009293936, 0.7862729,...</td>\n",
+       "      <td>[0.19858713, -1.8665266, -0.3323754, 0.0465058...</td>\n",
+       "      <td>[-0.0952643, 0.8016211, 0.08129032, 0.15187423...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>327</th>\n",
+       "      <td>CHANEL Black cotton bodycon tank dress with zi...</td>\n",
+       "      <td>[1.0761135, 0.18927886, -0.007131472, 0.625682...</td>\n",
+       "      <td>[0.07516122, -0.1886161, 0.1334078, -0.2829321...</td>\n",
+       "      <td>[-0.12297699, 0.026368856, 0.04415588, 0.26031...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>328</th>\n",
+       "      <td>Reformation X Veda Women's Bad Leather Jacket ...</td>\n",
+       "      <td>[0.79690784, 1.2895226, 0.22802149, -0.2736021...</td>\n",
+       "      <td>[-0.12224964, -0.38734418, 0.35824925, 0.95855...</td>\n",
+       "      <td>[0.6507246, 0.27751687, 0.36114892, -0.0831387...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329</th>\n",
+       "      <td>DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...</td>\n",
+       "      <td>[1.1617887, 0.19193622, 0.046035454, 0.4334900...</td>\n",
+       "      <td>[-0.20762922, 0.1754938, -0.7334341, -0.106492...</td>\n",
+       "      <td>[-0.31946087, 0.19534132, 0.37351555, -0.09741...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330</th>\n",
+       "      <td>PATAGONIA Nano Puff Jacket Zip Primaloft Insul...</td>\n",
+       "      <td>[0.2912089, 0.72192264, -0.01620815, 0.0022971...</td>\n",
+       "      <td>[0.0026952028, -1.6660439, 0.03839147, -0.2164...</td>\n",
+       "      <td>[0.12799336, 0.75828236, 0.10943861, -0.036647...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>331 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                  name  \\\n",
+       "0    Women's Under Armour Hustle Fleece Hoodie pull...   \n",
+       "1         Patagonia Los Gatos Fleece Grey Pullover.jpg   \n",
+       "2    REI Women's Down With It Quilted Hooded Parka ...   \n",
+       "3    Chanel Haute Couture Navy Blue Dress Semi Shee...   \n",
+       "4    Patagonia Women’s S Los Gatos Quarter-Zip Flee...   \n",
+       "..                                                 ...   \n",
+       "326             Women's REI Elements Jacket Size M.jpg   \n",
+       "327  CHANEL Black cotton bodycon tank dress with zi...   \n",
+       "328  Reformation X Veda Women's Bad Leather Jacket ...   \n",
+       "329  DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...   \n",
+       "330  PATAGONIA Nano Puff Jacket Zip Primaloft Insul...   \n",
+       "\n",
+       "          sentence-transformer-clip-ViT-L-14-embedding  \\\n",
+       "0    [1.0734258, 0.99022365, 0.32032806, 0.2895219,...   \n",
+       "1    [0.6227796, 0.026531212, 0.45240527, -0.488214...   \n",
+       "2    [0.8497103, 1.2925782, -0.21685322, 0.24116844...   \n",
+       "3    [0.536018, 0.60787296, -0.2751825, 1.0325747, ...   \n",
+       "4    [0.79398394, 1.3899276, -0.21383175, 0.0109823...   \n",
+       "..                                                 ...   \n",
+       "326  [0.6310029, 0.9942212, 0.009293936, 0.7862729,...   \n",
+       "327  [1.0761135, 0.18927886, -0.007131472, 0.625682...   \n",
+       "328  [0.79690784, 1.2895226, 0.22802149, -0.2736021...   \n",
+       "329  [1.1617887, 0.19193622, 0.046035454, 0.4334900...   \n",
+       "330  [0.2912089, 0.72192264, -0.01620815, 0.0022971...   \n",
+       "\n",
+       "                                     fashion-embedding  \\\n",
+       "0    [0.23177437, -1.9268938, 0.273342, -0.02474568...   \n",
+       "1    [0.38133767, -1.3040155, 1.1697398, -0.3085520...   \n",
+       "2    [-0.30043703, -1.3144073, -0.33848628, 0.24008...   \n",
+       "3    [-0.101031125, 0.033914, -0.44531134, -0.64656...   \n",
+       "4    [0.60070944, -1.1051046, 1.0572466, 0.47092092...   \n",
+       "..                                                 ...   \n",
+       "326  [0.19858713, -1.8665266, -0.3323754, 0.0465058...   \n",
+       "327  [0.07516122, -0.1886161, 0.1334078, -0.2829321...   \n",
+       "328  [-0.12224964, -0.38734418, 0.35824925, 0.95855...   \n",
+       "329  [-0.20762922, 0.1754938, -0.7334341, -0.106492...   \n",
+       "330  [0.0026952028, -1.6660439, 0.03839147, -0.2164...   \n",
+       "\n",
+       "                                 openai-clip-embedding  \n",
+       "0    [-0.32902592, -0.09434131, 0.3055967, 0.229937...  \n",
+       "1    [-0.1695469, 0.5067289, 0.31120676, -0.0083701...  \n",
+       "2    [-0.24841668, 0.4876942, 0.39810008, -0.141552...  \n",
+       "3    [-0.08328074, 0.19443086, 0.14361368, 0.259305...  \n",
+       "4    [-0.27894062, -0.09589732, 0.5556799, -0.13458...  \n",
+       "..                                                 ...  \n",
+       "326  [-0.0952643, 0.8016211, 0.08129032, 0.15187423...  \n",
+       "327  [-0.12297699, 0.026368856, 0.04415588, 0.26031...  \n",
+       "328  [0.6507246, 0.27751687, 0.36114892, -0.0831387...  \n",
+       "329  [-0.31946087, 0.19534132, 0.37351555, -0.09741...  \n",
+       "330  [0.12799336, 0.75828236, 0.10943861, -0.036647...  \n",
+       "\n",
+       "[331 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "embeddings_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "embeddings_path = os.path.join(data_path, 'metadata/patagonia_losGatos_embeddings.pq')\n",
+    "embeddings_df.to_parquet(embeddings_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "embeddings_df = pd.read_parquet(embeddings_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "for i, row in embeddings_df.iterrows():\n",
+    "    if '\\n' in row['name']:\n",
+    "        print(row['name'])\n",
+    "        embeddings_df = embeddings_df.drop(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>sentence-transformer-clip-ViT-L-14-embedding</th>\n",
+       "      <th>fashion-embedding</th>\n",
+       "      <th>openai-clip-embedding</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Women's Under Armour Hustle Fleece Hoodie pull...</td>\n",
+       "      <td>[1.0734258, 0.99022365, 0.32032806, 0.2895219,...</td>\n",
+       "      <td>[0.23177437, -1.9268938, 0.273342, -0.02474568...</td>\n",
+       "      <td>[-0.32902592, -0.09434131, 0.3055967, 0.229937...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Patagonia Los Gatos Fleece Grey Pullover.jpg</td>\n",
+       "      <td>[0.6227796, 0.026531212, 0.45240527, -0.488214...</td>\n",
+       "      <td>[0.38133767, -1.3040155, 1.1697398, -0.3085520...</td>\n",
+       "      <td>[-0.1695469, 0.5067289, 0.31120676, -0.0083701...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>REI Women's Down With It Quilted Hooded Parka ...</td>\n",
+       "      <td>[0.8497103, 1.2925782, -0.21685322, 0.24116844...</td>\n",
+       "      <td>[-0.30043703, -1.3144073, -0.33848628, 0.24008...</td>\n",
+       "      <td>[-0.24841668, 0.4876942, 0.39810008, -0.141552...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Chanel Haute Couture Navy Blue Dress Semi Shee...</td>\n",
+       "      <td>[0.536018, 0.60787296, -0.2751825, 1.0325747, ...</td>\n",
+       "      <td>[-0.101031125, 0.033914, -0.44531134, -0.64656...</td>\n",
+       "      <td>[-0.08328074, 0.19443086, 0.14361368, 0.259305...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Patagonia Women’s S Los Gatos Quarter-Zip Flee...</td>\n",
+       "      <td>[0.79398394, 1.3899276, -0.21383175, 0.0109823...</td>\n",
+       "      <td>[0.60070944, -1.1051046, 1.0572466, 0.47092092...</td>\n",
+       "      <td>[-0.27894062, -0.09589732, 0.5556799, -0.13458...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>326</th>\n",
+       "      <td>Women's REI Elements Jacket Size M.jpg</td>\n",
+       "      <td>[0.6310029, 0.9942212, 0.009293936, 0.7862729,...</td>\n",
+       "      <td>[0.19858713, -1.8665266, -0.3323754, 0.0465058...</td>\n",
+       "      <td>[-0.0952643, 0.8016211, 0.08129032, 0.15187423...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>327</th>\n",
+       "      <td>CHANEL Black cotton bodycon tank dress with zi...</td>\n",
+       "      <td>[1.0761135, 0.18927886, -0.007131472, 0.625682...</td>\n",
+       "      <td>[0.07516122, -0.1886161, 0.1334078, -0.2829321...</td>\n",
+       "      <td>[-0.12297699, 0.026368856, 0.04415588, 0.26031...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>328</th>\n",
+       "      <td>Reformation X Veda Women's Bad Leather Jacket ...</td>\n",
+       "      <td>[0.79690784, 1.2895226, 0.22802149, -0.2736021...</td>\n",
+       "      <td>[-0.12224964, -0.38734418, 0.35824925, 0.95855...</td>\n",
+       "      <td>[0.6507246, 0.27751687, 0.36114892, -0.0831387...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329</th>\n",
+       "      <td>DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...</td>\n",
+       "      <td>[1.1617887, 0.19193622, 0.046035454, 0.4334900...</td>\n",
+       "      <td>[-0.20762922, 0.1754938, -0.7334341, -0.106492...</td>\n",
+       "      <td>[-0.31946087, 0.19534132, 0.37351555, -0.09741...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330</th>\n",
+       "      <td>PATAGONIA Nano Puff Jacket Zip Primaloft Insul...</td>\n",
+       "      <td>[0.2912089, 0.72192264, -0.01620815, 0.0022971...</td>\n",
+       "      <td>[0.0026952028, -1.6660439, 0.03839147, -0.2164...</td>\n",
+       "      <td>[0.12799336, 0.75828236, 0.10943861, -0.036647...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>331 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                  name  \\\n",
+       "0    Women's Under Armour Hustle Fleece Hoodie pull...   \n",
+       "1         Patagonia Los Gatos Fleece Grey Pullover.jpg   \n",
+       "2    REI Women's Down With It Quilted Hooded Parka ...   \n",
+       "3    Chanel Haute Couture Navy Blue Dress Semi Shee...   \n",
+       "4    Patagonia Women’s S Los Gatos Quarter-Zip Flee...   \n",
+       "..                                                 ...   \n",
+       "326             Women's REI Elements Jacket Size M.jpg   \n",
+       "327  CHANEL Black cotton bodycon tank dress with zi...   \n",
+       "328  Reformation X Veda Women's Bad Leather Jacket ...   \n",
+       "329  DISNEY HER UNIVERSE LILO AND STICH Rainbow Qua...   \n",
+       "330  PATAGONIA Nano Puff Jacket Zip Primaloft Insul...   \n",
+       "\n",
+       "          sentence-transformer-clip-ViT-L-14-embedding  \\\n",
+       "0    [1.0734258, 0.99022365, 0.32032806, 0.2895219,...   \n",
+       "1    [0.6227796, 0.026531212, 0.45240527, -0.488214...   \n",
+       "2    [0.8497103, 1.2925782, -0.21685322, 0.24116844...   \n",
+       "3    [0.536018, 0.60787296, -0.2751825, 1.0325747, ...   \n",
+       "4    [0.79398394, 1.3899276, -0.21383175, 0.0109823...   \n",
+       "..                                                 ...   \n",
+       "326  [0.6310029, 0.9942212, 0.009293936, 0.7862729,...   \n",
+       "327  [1.0761135, 0.18927886, -0.007131472, 0.625682...   \n",
+       "328  [0.79690784, 1.2895226, 0.22802149, -0.2736021...   \n",
+       "329  [1.1617887, 0.19193622, 0.046035454, 0.4334900...   \n",
+       "330  [0.2912089, 0.72192264, -0.01620815, 0.0022971...   \n",
+       "\n",
+       "                                     fashion-embedding  \\\n",
+       "0    [0.23177437, -1.9268938, 0.273342, -0.02474568...   \n",
+       "1    [0.38133767, -1.3040155, 1.1697398, -0.3085520...   \n",
+       "2    [-0.30043703, -1.3144073, -0.33848628, 0.24008...   \n",
+       "3    [-0.101031125, 0.033914, -0.44531134, -0.64656...   \n",
+       "4    [0.60070944, -1.1051046, 1.0572466, 0.47092092...   \n",
+       "..                                                 ...   \n",
+       "326  [0.19858713, -1.8665266, -0.3323754, 0.0465058...   \n",
+       "327  [0.07516122, -0.1886161, 0.1334078, -0.2829321...   \n",
+       "328  [-0.12224964, -0.38734418, 0.35824925, 0.95855...   \n",
+       "329  [-0.20762922, 0.1754938, -0.7334341, -0.106492...   \n",
+       "330  [0.0026952028, -1.6660439, 0.03839147, -0.2164...   \n",
+       "\n",
+       "                                 openai-clip-embedding  \n",
+       "0    [-0.32902592, -0.09434131, 0.3055967, 0.229937...  \n",
+       "1    [-0.1695469, 0.5067289, 0.31120676, -0.0083701...  \n",
+       "2    [-0.24841668, 0.4876942, 0.39810008, -0.141552...  \n",
+       "3    [-0.08328074, 0.19443086, 0.14361368, 0.259305...  \n",
+       "4    [-0.27894062, -0.09589732, 0.5556799, -0.13458...  \n",
+       "..                                                 ...  \n",
+       "326  [-0.0952643, 0.8016211, 0.08129032, 0.15187423...  \n",
+       "327  [-0.12297699, 0.026368856, 0.04415588, 0.26031...  \n",
+       "328  [0.6507246, 0.27751687, 0.36114892, -0.0831387...  \n",
+       "329  [-0.31946087, 0.19534132, 0.37351555, -0.09741...  \n",
+       "330  [0.12799336, 0.75828236, 0.10943861, -0.036647...  \n",
+       "\n",
+       "[331 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "embeddings_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "for fp in os.listdir(data_path + 'images/'):\n",
+    "    if '?' in fp:\n",
+    "        print(fp)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "1+1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df.to_csv('random.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "disco-io/data\n"
+     ]
+    }
+   ],
+   "source": [
+    "import utils\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from utils import get_immediate_subdirectories"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "disco-io/data\n",
+      "Refreshing all datasets: ['test']\n"
+     ]
+    }
+   ],
+   "source": [
+    "utils.refresh_all_datasets()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'test'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "utils.cur_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "disco-io/data\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['test']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_immediate_subdirectories('data')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils import fs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s3_path = 'data'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s3_full_path = f\"{utils.S3_BUCKET}/{s3_path}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['disco-io/data/Cvlsntdjgrnuyrlf.jpg', 'disco-io/data/test']"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fs.glob(f\"{s3_full_path}/*\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fs.isdir('disco-io/data/test')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "e85fcd8d0dbb45c39d3e544566c77318961c8114425a16ff4cb5c14067743b34"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

misc.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import pandas as pd
+import random
def generate_random_images_df(filename, num_rows=10):
    """Write a TSV of random placeholder titles and image URLs to ``filename``.

    Each row has a ``title`` column (5-20 random lowercase letters, first
    letter capitalised) and an ``IMG_URL`` column (a fixed picsum.photos URL).

    Args:
        filename: Destination path handed to ``DataFrame.to_csv``.
        num_rows: Number of rows to generate. Defaults to 10, the previously
            hard-coded value.
    """

    def generate_title():
        # Random length first, then that many random letters.
        title_length = random.randint(5, 20)
        title = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=title_length))
        return title.capitalize()

    def generate_image_url():
        # Change the size in the URL as per your requirement.
        return "https://picsum.photos/200/300"

    # Explicit columns keep the header stable even when num_rows is 0.
    rows = [
        {'title': generate_title(), 'IMG_URL': generate_image_url()}
        for _ in range(num_rows)
    ]
    df = pd.DataFrame(rows, columns=['title', 'IMG_URL'])
    df.to_csv(filename, sep='\t', index=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+gradio==3.30.0
+numpy==1.23.5
+pandas==1.5.3
+pandas_stubs==1.2.0.35
+Pillow==9.5.0
+sentence_transformers==2.2.2
+pyarrow
+transformers~=4.26.1
+tqdm
+streamlit
+s3fs
+requests
+pinecone-client

run.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import gradio as gr
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown(
        """
    # Animal Generator
    Once you select a species, the detail panel should be visible.
    """
    )

    # Top-level selectors; the dropdown starts empty and is filled on selection.
    species = gr.Radio(label="Animal Class", choices=["Mammal", "Fish", "Bird"])
    animal = gr.Dropdown(label="Animal", choices=[])

    # Detail panel, hidden until an animal class has been picked.
    with gr.Column(visible=False) as details_col:
        weight = gr.Slider(0, 20)
        details = gr.Textbox(label="Extra Details")
        generate_btn = gr.Button("Generate")
        output = gr.Textbox(label="Output")

    # Animal class -> animals offered in the dropdown.
    species_map = {
        "Mammal": ["Elephant", "Giraffe", "Hamster"],
        "Fish": ["Shark", "Salmon", "Tuna"],
        "Bird": ["Chicken", "Eagle", "Hawk"],
    }

    def filter_species(species):
        """Fill the animal dropdown for the chosen class and reveal the details."""
        options = species_map[species]
        # The second entry of each list is preselected.
        dropdown_update = gr.Dropdown.update(choices=options, value=options[1])
        panel_update = gr.update(visible=True)
        return dropdown_update, panel_update

    def filter_weight(animal):
        """Give the three listed large animals a slider maximum of 100, others 20."""
        upper = 100 if animal in ("Elephant", "Shark", "Giraffe") else 20
        return gr.update(maximum=upper)

    def _resize_details(w):
        """Add one textbox line per 10 units of weight."""
        return gr.update(lines=int(w / 10) + 1)

    def _echo(x):
        """Pass the details text through to the output box unchanged."""
        return x

    # Event wiring, registered in the same order as before.
    species.change(filter_species, species, [animal, details_col])
    animal.change(filter_weight, animal, weight)
    weight.change(_resize_details, weight, details)
    generate_btn.click(_echo, details, output)
if __name__ == "__main__":
    # tqdm is only needed when this file is executed directly.
    from tqdm import tqdm

    # Runs nine million empty iterations behind a progress bar. The Gradio
    # launch below is still commented out, so no UI is started.
    for _ in tqdm(range(9_000_000)):
        pass
    #demo.launch()

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import streamlit as st
+import numpy as np
+from PIL import Image
def process_image(input_image):
    """Return a copy of ``input_image``.

    Placeholder for real image processing: the input is duplicated via its
    ``copy()`` method and the duplicate is returned untouched.
    """
    return input_image.copy()
# Page title and sidebar heading.
st.title('Multiple Input and Output Images Interface')
st.sidebar.title('Input Images')

# One uploader per slot in the sidebar; keep only the slots that were filled.
num_images = 3  # The number of input images
uploader_slots = [
    st.sidebar.file_uploader(f'Upload Image {slot + 1}', type=['png', 'jpg', 'jpeg'])
    for slot in range(num_images)
]
uploaded_images = [upload for upload in uploader_slots if upload is not None]

if not uploaded_images:
    st.warning('Please upload images in the sidebar.')
else:
    # Show every uploaded image as it is opened.
    st.header('Input Images')
    input_images = []
    for upload in uploaded_images:
        opened = Image.open(upload)
        input_images.append(opened)
        st.image(opened, width=200, caption='Uploaded Image')

    # Then run each one through process_image and show the result.
    st.header('Output Images')
    for opened in input_images:
        st.image(process_image(opened), width=200, caption='Processed Image')

utils.py ADDED Viewed

	@@ -0,0 +1,170 @@

+from sentence_transformers import SentenceTransformer, util as st_util
+from transformers import CLIPModel, CLIPProcessor
+from PIL import Image
+import requests
+import os
+import torch
+torch.set_printoptions(precision=10)
+from tqdm import tqdm
+import s3fs
+from io import BytesIO
+import vector_db
# Model keys listed in model_names. The other keys defined in
# model_name_to_ids are "sentence-transformer-clip-ViT-L-14" and "openai-clip".
model_names = ["fashion"]

# Short model key -> model identifier handed to the model loaders.
model_name_to_ids = {
    "sentence-transformer-clip-ViT-L-14": "clip-ViT-L-14",
    "fashion": "patrickjohncyh/fashion-clip",
    "openai-clip": "openai/clip-vit-base-patch32",
}
# AWS credentials come from the environment; a missing variable raises
# KeyError as soon as this module is imported.
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

# Define your bucket and dataset name.
S3_BUCKET = "s3://disco-io"

# Shared S3 filesystem handle used by every helper in this module.
fs = s3fs.S3FileSystem(
    key=AWS_ACCESS_KEY_ID,
    secret=AWS_SECRET_ACCESS_KEY,
)

# S3 paths always use "/" as the separator, so build them with plain string
# formatting instead of os.path.join (which uses the host OS separator).
ROOT_DATA_PATH = f"{S3_BUCKET}/data"
def get_data_path():
    """Return the S3 path of the current dataset: ``ROOT_DATA_PATH/cur_dataset``.

    The path is joined with "/" explicitly because S3 paths do not follow the
    host operating system's separator.
    """
    return f"{ROOT_DATA_PATH}/{cur_dataset}"
def get_image_path():
    """Return the ``images`` folder under the current dataset's S3 path.

    Joined with "/" explicitly because S3 paths do not follow the host
    operating system's separator.
    """
    return f"{get_data_path()}/images"
def get_metadata_path():
    """Return the ``metadata`` folder under the current dataset's S3 path.

    Joined with "/" explicitly because S3 paths do not follow the host
    operating system's separator.
    """
    return f"{get_data_path()}/metadata"
def get_embeddings_path():
    """Return the path ``<metadata folder>/<cur_dataset>_embeddings.pq``.

    Joined with "/" explicitly because S3 paths do not follow the host
    operating system's separator.
    """
    return f"{get_metadata_path()}/{cur_dataset}_embeddings.pq"
# Cache of loaded models keyed by model name; filled in by load_models().
model_dict = {}
def download_to_s3(url, s3_path, timeout=30):
    """Stream the content at ``url`` into the object at ``s3_path``.

    Args:
        url: URL to download with ``requests``.
        s3_path: Destination path, opened for binary writing through the
            module-level ``fs`` filesystem.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the caller forever. Defaults to 30.

    Raises:
        requests.exceptions.HTTPError: If the response status signals an
            error (raised by ``raise_for_status``).
    """
    # The context manager releases the streamed connection even when the
    # status check or the upload raises.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # Copy the body to S3 in 8 KiB chunks so it is never fully in memory.
        with fs.open(s3_path, "wb") as s3_file:
            for chunk in response.iter_content(chunk_size=8192):
                s3_file.write(chunk)
def remove_all_files_from_s3_directory(s3_directory):
    """Remove every object listed directly under ``s3_directory``.

    Removal is best-effort: a failure on one object is reported on stdout and
    the remaining objects are still processed.

    Args:
        s3_directory: Path listed through the module-level ``fs`` filesystem.
    """
    try:
        objects = fs.ls(s3_directory)
    except FileNotFoundError:
        # Nothing has been written under this prefix yet, so there is nothing
        # to clean up.
        return

    for obj in objects:
        try:
            fs.rm(obj)
        except Exception as exc:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate. The cause is included in the report.
            print('Error removing file: ' + obj + ' (' + str(exc) + ')')
def download_images(df, img_folder):
    """Replace the contents of ``img_folder`` with the images listed in ``df``.

    The folder is emptied first. Each row's ``IMG_URL`` is then downloaded to
    ``<img_folder>/<title>.jpg``, with "/" in the title replaced by "_" and
    newlines removed. Downloads are best-effort: a failing row is reported on
    stdout and the loop moves on.

    Args:
        df: DataFrame with ``title`` and ``IMG_URL`` columns.
        img_folder: Destination S3 folder.
    """
    remove_all_files_from_s3_directory(img_folder)
    for index, row in df.iterrows():
        try:
            file_name = row['title'].replace('/', '_').replace('\n', '') + '.jpg'
            # S3 paths use "/" regardless of the host OS.
            target = img_folder.rstrip('/') + '/' + file_name
            download_to_s3(row['IMG_URL'], target)
        except Exception as exc:
            # Format with !r so a non-string title (e.g. NaN) cannot make the
            # error report itself raise.
            print(f"Error downloading image: {index} {row.get('title')!r}: {exc}")
def load_models():
    """Load every model in ``model_name_to_ids`` that is not yet in ``model_dict``.

    Names starting with ``sentence-transformer`` are loaded as a
    ``SentenceTransformer``. All other names are loaded as a ``CLIPModel``
    with its ``CLIPProcessor``, and the Hugging Face id is stored as ``hf_dir``.
    """
    for name, hf_id in model_name_to_ids.items():
        if name in model_dict:
            continue

        # Register the entry before loading, then fill it in place.
        entry = dict()
        model_dict[name] = entry

        if name.startswith('sentence-transformer'):
            entry['model'] = SentenceTransformer(hf_id)
            continue

        entry['hf_dir'] = hf_id
        entry['model'] = CLIPModel.from_pretrained(hf_id)
        entry['processor'] = CLIPProcessor.from_pretrained(hf_id)


# Models are loaded as a side effect of importing this module.
if not model_dict:
    print('Loading models...')
    load_models()
def get_image_embedding(model_name, image):
    """Compute the embedding of ``image`` with the model named ``model_name``.

    Sentence-transformer models encode the image directly. Any other model is
    treated as CLIP: the image goes through the stored processor and the
    first row of ``get_image_features`` is returned as a NumPy array.
    """
    entry = model_dict[model_name]
    model = entry['model']

    if not model_name.startswith('sentence-transformer'):
        inputs = entry['processor'](images=image, return_tensors="pt")
        features = model.get_image_features(**inputs)
        # Single image in, so take the only row of the batch.
        return features.detach().numpy()[0]

    return model.encode(image)
def s3_path_to_image(fs, s3_path):
    """
    Read an image from S3 and return it as a PIL Image.
    Args:
        fs: Filesystem object whose ``open(path, "rb")`` yields the file.
        s3_path (str): The path to the image in the S3 bucket, including the bucket name (e.g., "bucket_name/path/to/image.jpg").
    Returns:
        Image: A PIL Image object backed by an in-memory copy of the file.
    """
    with fs.open(s3_path, "rb") as handle:
        raw_bytes = handle.read()
    # The bytes are fully buffered, so the image no longer needs the S3 handle.
    return Image.open(BytesIO(raw_bytes))
def generate_and_save_embeddings():
    """Embed every ``.jpg`` in the current dataset and store it in the vector DB.

    Each image under ``get_image_path()`` is embedded with every model in
    ``model_name_to_ids``, and each embedding is handed to
    ``vector_db.add_image_embedding_to_db`` with the model name, current
    dataset, S3 path and file name.
    """
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for fp in tqdm(fs.ls(get_image_path()), desc="Generate embeddings for Images"):
            if not fp.endswith('.jpg'):
                continue

            name = fp.split('/')[-1]
            s3_path = 's3://' + fp
            # Fetch and decode the image once, then reuse it for every model
            # instead of downloading it again per model.
            image = s3_path_to_image(fs, s3_path)

            for model_name in model_name_to_ids:
                vector_db.add_image_embedding_to_db(
                    embedding=get_image_embedding(model_name, image),
                    model_name=model_name,
                    dataset_name=cur_dataset,
                    path_to_image=s3_path,
                    image_name=name,
                )
def get_immediate_subdirectories(s3_path):
    """Return the names of the directories directly under ``s3_path``.

    Only the last path component of each matching entry is returned.
    """
    names = []
    for entry in fs.glob(f"{s3_path}/*"):
        if fs.isdir(entry):
            names.append(entry.split('/')[-1])
    return names


# Datasets are discovered at import time. The first one becomes the current
# dataset, so an empty listing raises IndexError here.
all_datasets = get_immediate_subdirectories(ROOT_DATA_PATH)
cur_dataset = all_datasets[0]
def set_cur_dataset(dataset):
    """Refresh the dataset listing, then make ``dataset`` the current dataset."""
    global cur_dataset
    refresh_all_datasets()
    print(f"Setting current dataset to {dataset}")
    cur_dataset = dataset
def refresh_all_datasets():
    """Re-list the dataset folders under ``ROOT_DATA_PATH`` into ``all_datasets``."""
    global all_datasets
    found = get_immediate_subdirectories(ROOT_DATA_PATH)
    all_datasets = found
    print(f"Refreshing all datasets: {found}")
def url_to_image(url, timeout=30):
    """Download ``url`` and return it as a PIL Image, or ``None`` on failure.

    Args:
        url: Address of the image to fetch.
        timeout: Timeout in seconds passed to ``requests.get``. Defaults to 30.

    Returns:
        The opened PIL Image, or ``None`` when the request fails, the status
        signals an error, or the payload cannot be opened as an image.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError also covers PIL failing to identify the payload, so a
        # non-image response returns None like any other fetch failure.
        print(f"Error fetching image from URL: {url} ({e})")
        return None

vector_db.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import pinecone
+import os
+import uuid
# Connect to Pinecone at import time. The API key must be set in the
# environment. The environment/region can be overridden with
# PINECONE_ENVIRONMENT and defaults to the previously hard-coded value.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp"),
)

# One index per embedding dimensionality.
INDEX_512_NAME = "images-512"
INDEX_768_NAME = "images-768"
index_512 = pinecone.Index(INDEX_512_NAME)
index_768 = pinecone.Index(INDEX_768_NAME)

# Namespace names for the dev and prod data sets.
DEV_NAMESPACE = 'disco-web-app-search-dev'
PROD_NAMESPACE = 'disco-web-app-search-prod'
def add_image_embedding_to_db(embedding, model_name, dataset_name, path_to_image, image_name):
    """Upsert one image embedding, with its metadata, into the matching index.

    Args:
        embedding: 1-D array exposing ``shape`` and ``tolist()``; its length
            (512 or 768) selects the index.
        model_name: Stored as ``model`` metadata.
        dataset_name: Stored as ``dataset`` metadata.
        path_to_image: Stored as ``path`` metadata.
        image_name: Stored as ``image_name`` metadata.

    Raises:
        KeyError: If the embedding length is neither 512 nor 768.
    """
    index = {
        512: index_512,
        768: index_768,
    }[embedding.shape[0]]

    metadata = {
        'model': model_name,
        'dataset': dataset_name,
        'path': path_to_image,
        'image_name': image_name,
    }
    # Write to the namespace that query_embeddings_db reads from. Without it
    # the vectors land in the default namespace and are never returned.
    # NOTE(review): ids are random, so re-embedding the same image adds a new
    # vector instead of overwriting the old one.
    index.upsert(
        vectors=[(str(uuid.uuid4()), embedding.tolist(), metadata)],
        namespace=DEV_NAMESPACE,
    )
def query_embeddings_db(query_embedding, dataset_name, model_name, top_k=4):
    """Return the ``top_k`` nearest stored embeddings for one model and dataset.

    Args:
        query_embedding: Sequence or array of floats; its length (512 or 768)
            selects the index.
        dataset_name: Only vectors whose ``dataset`` metadata equals this match.
        model_name: Only vectors whose ``model`` metadata equals this match.
        top_k: Maximum number of matches to return. Defaults to 4.

    Returns:
        The Pinecone query response, with metadata included.

    Raises:
        KeyError: If the embedding length is neither 512 nor 768.
    """
    index = {
        512: index_512,
        768: index_768,
    }[len(query_embedding)]

    # Accept NumPy arrays as well as plain lists.
    if hasattr(query_embedding, "tolist"):
        vector = query_embedding.tolist()
    else:
        vector = list(query_embedding)

    # Several models share an embedding size and therefore an index, so
    # restrict matches to the requested model and dataset.
    return index.query(
        vector=vector,
        top_k=top_k,
        namespace=DEV_NAMESPACE,
        filter={
            'model': {'$eq': model_name},
            'dataset': {'$eq': dataset_name},
        },
        include_metadata=True,
    )