In [None]:
!pip install transformers pinecone-client tqdm

The dataset used is the [Unsplash Lite dataset](https://github.com/unsplash/datasets).

In [2]:
import pandas as pd

images = pd.read_csv('photos.tsv000', delimiter='\t')
images.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
0,XMyPniM9LF0,https://unsplash.com/photos/XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,2014-09-29 00:08:38.594364,t,4272,2848,1.5,Woman exploring a forest,michellespencer77,...,,,2375421,6967,woman walking in the middle of forest,,,,,L56bVcRRIWMh.gVunlS4SMbsRRxr
1,rDLBArZUl1c,https://unsplash.com/photos/rDLBArZUl1c,https://images.unsplash.com/photo-141633941111...,2014-11-18 19:36:57.08945,t,3000,4000,0.75,Succulents in a terrarium,ugmonk,...,,,13784815,82141,succulent plants in clear glass terrarium,,,,,LvI$4txu%2s:_4t6WUj]xat7RPoe
2,cNDGZ2sQ3Bo,https://unsplash.com/photos/cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,2015-01-01 20:02:02.097036,t,2564,1710,1.5,Rural winter mountainside,johnprice,...,,,1302461,3428,rocky mountain under gray sky at daytime,,,,,LhMj%NxvM{t7_4t7aeoM%2M{ozj[
3,iuZ_D1eoq9k,https://unsplash.com/photos/iuZ_D1eoq9k,https://images.unsplash.com/photo-141487280988...,2014-11-01 20:15:13.410073,t,2912,4368,0.67,Poppy seeds and flowers,krisatomic,...,,,2890238,33704,red common poppy flower selective focus phography,,,,,LSC7DirZAsX7}Br@GEWWmnoLWCnj
4,BeD3vjQ8SI0,https://unsplash.com/photos/BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,2014-11-26 13:13:50.134383,t,4896,3264,1.5,Silhouette near dark trees,jonaseriksson,...,,,8704860,49662,trees during night time,,,,,L25|_:V@0hxtI=W;odae0ht6=^NG


We download using the `photo_image_url` field.

In [4]:
from PIL import Image
import requests
from io import BytesIO

url = images['photo_image_url'].iloc[0]

response = requests.get(url)
img = Image.open(BytesIO(response.content))
img

We need to use these images to create vector embeddings, to do this we will use OpenAI's CLIP from the `transformers` library.

```
!pip install transformers
```

In [5]:
from transformers import CLIPProcessor, CLIPModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)

2022-08-12 14:07:47.935784: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


Now we're ready to use the vision transformer (ViT) portion of CLIP to create feature vectors (embedding representations) from the image.

In [6]:
img = processor(
    text=None,
    images=img,
    return_tensors='pt',
    padding=True
)['pixel_values'].to(device)

In [7]:
out = model.get_image_features(pixel_values=img)
out.shape

torch.Size([1, 512])

In [8]:
out = out.squeeze(0)
out.shape

torch.Size([512])

In [9]:
emb = out.cpu().detach().numpy()
emb.shape

(512,)

In [10]:
emb.min(), emb.max()

(-7.985501, 2.0108054)

Now we have a single `512` dimensional vector that represents the *meaning* behind the image. As we will be using dot product similarity we should also normalize these vectors.

In [10]:
import numpy as np

emb = emb / np.linalg.norm(emb)

In [11]:
emb.min(), emb.max()

(-0.56626415, 0.13343191)

## Indexing

To index this image in Pinecone we first install the Pinecone client:

```
!pip install pinecone-client
```

And then initialize our connection to Pinecone, this requires a [free API key](https://app.pinecone.io/).

In [11]:
import pinecone

index_name = "unsplash-25k-clip"

pinecone.init(
    api_key="<<API_KEY_HERE>>",
    environment="us-west1-gcp"
)

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        emb.shape[0],
        metric="dotproduct"
    )
# connect to the index
index = pinecone.Index(index_name)

To upsert the single feature embedding we have created, we use `upsert`. There are also some possibly relevant metadata info we might want to add.

In [13]:
row = images.iloc[0]

_id = row['photo_id']
meta = {
    "photo_url": row["photo_url"],
    "photo_image_url": row["photo_image_url"],
    "photo_submitted_at": row["photo_submitted_at"],
    "photo_description": row["photo_description"],
    "photographer_username": row["photographer_username"],
    "ai_description": row["ai_description"]
}

meta

{'photo_url': 'https://unsplash.com/photos/XMyPniM9LF0',
 'photo_image_url': 'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3',
 'photo_submitted_at': '2014-09-29 00:08:38.594364',
 'photo_description': 'Woman exploring a forest',
 'photographer_username': 'michellespencer77',
 'ai_description': 'woman walking in the middle of forest'}

In [14]:
to_upsert = [(_id, emb.tolist(), meta)]

index.upsert(to_upsert)

{'upserted_count': 1}

In [15]:
_id

'XMyPniM9LF0'

Note that we added a string ID value `"XMyPniM9LF0"` and also converted the feature embedding tensor to a flat list before adding to our Pinecone index.

## Indexing Everything

So far we've built one feature embedding and indexed it in Pinecone, now let's repeat the process for a lot of images.

We will do this in batches, taking `32` images at a time, embedding them with Resnet-50, and indexing them in Pinecone.

In [23]:
import numpy as np

In [79]:
from tqdm.auto import tqdm
batch_size = 16
images_len = len(images)

exceptions = []

for i in tqdm(range(3088, images_len, batch_size)):
    # select the batch start and end
    i_end = min(i + batch_size, images_len)
    # get batch
    batch = images.iloc[i:i_end]
    # retrieve URLs
    url_batch = batch['photo_image_url']
    # get images
    image_batch = []
    for url in url_batch:
        try:
            response = requests.get(url)
            img = Image.open(BytesIO(response.content))
            if img.mode in ['L', 'CMYK', 'RGBA']:
                # L is grayscale, CMYK uses alternative color channels
                img = img.convert('RGB')
            image_batch.append(img)
        except Exception as e:
            exceptions.append(("url", e))
    # process images and extract pytorch tensor pixel values
    try:
        image_batch = processor(
            text=None,
            images=image_batch,
            return_tensors='pt'
        )['pixel_values'].to(device)
        # feed tensors to model and extract last state
        out = model.get_image_features(pixel_values=image_batch)
        out = out.squeeze(0)
        # take the mean across each dimension to create a single vector embedding
        embeds = out.cpu().detach().numpy()
        # normalize and convert to list
        embeds = embeds / np.linalg.norm(embeds, axis=0)
        embeds = embeds.tolist()
        # get ID values
        ids = batch['photo_id']
        # prep metadata
        metadata = batch[[
            "photo_url", "photo_image_url", "photo_submitted_at",
            "photo_description", "photographer_username", "ai_description"
        ]].fillna("").to_dict(orient="records")
        # zip all data together and upsert
        to_upsert = zip(ids, embeds, metadata)
        index.upsert(to_upsert)
    except Exception as e:
        exceptions.append(("process", e))

  0%|          | 0/1370 [00:00<?, ?it/s]



---