In [None]:
# Attach Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Get Data

In [1]:
import pandas as pd

# Location of the scraped product catalogue on the mounted Drive.
DATASET_PATH = '/content/drive/MyDrive/Datasets/clothing_similarity_search.csv'
df = pd.read_csv(DATASET_PATH)

In [2]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,url,title,description
0,https://www.amazon.com/dp/B09BVXT8TJ,SHOKZ OpenRun Pro - Open-Ear Bluetooth Bone Co...,
1,https://www.amazon.com/dp/B094242FV1,SAMSUNG Galaxy Buds 2 True Wireless Earbuds No...,"Samsung Galaxy Buds2 is here, bringing a way f..."
2,https://www.amazon.com/dp/B09LYF2ST7,"BERIBES Bluetooth Headphones Over Ear, 65H Pla...",
3,https://www.amazon.com/dp/B08FCGH2RL,Skullcandy Crusher Evo Wireless Over-Ear Bluet...,Skullcandy Crusher Evo Wireless Over-Ear Headp...
4,https://www.amazon.com/dp/B000AJIF4E,Sony MDR7506 Professional Large Diaphragm Head...,Product DescriptionSony MDR7506 Professional S...


In [3]:
# Build one searchable text field per product from its title and description.
# Missing values are replaced with '' so the concatenation never yields NaN.
# The explicit ' ' separator keeps the last word of the title from fusing with
# the first word of the description; .str.strip() removes the leftover space
# when either side is empty.
df["product"] = (df['title'].fillna('') + ' ' + df['description'].fillna('')).str.strip()
df.head()

Unnamed: 0,url,title,description,product
0,https://www.amazon.com/dp/B09BVXT8TJ,SHOKZ OpenRun Pro - Open-Ear Bluetooth Bone Co...,,SHOKZ OpenRun Pro - Open-Ear Bluetooth Bone Co...
1,https://www.amazon.com/dp/B094242FV1,SAMSUNG Galaxy Buds 2 True Wireless Earbuds No...,"Samsung Galaxy Buds2 is here, bringing a way f...",SAMSUNG Galaxy Buds 2 True Wireless Earbuds No...
2,https://www.amazon.com/dp/B09LYF2ST7,"BERIBES Bluetooth Headphones Over Ear, 65H Pla...",,"BERIBES Bluetooth Headphones Over Ear, 65H Pla..."
3,https://www.amazon.com/dp/B08FCGH2RL,Skullcandy Crusher Evo Wireless Over-Ear Bluet...,Skullcandy Crusher Evo Wireless Over-Ear Headp...,Skullcandy Crusher Evo Wireless Over-Ear Bluet...
4,https://www.amazon.com/dp/B000AJIF4E,Sony MDR7506 Professional Large Diaphragm Head...,Product DescriptionSony MDR7506 Professional S...,Sony MDR7506 Professional Large Diaphragm Head...


In [4]:
# Keep only the columns needed for search; .copy() yields an independent
# DataFrame, so later column assignments do not touch `df`.
clothing_data = df.loc[:, ['url', 'product']].copy()
clothing_data.head()

Unnamed: 0,url,product
0,https://www.amazon.com/dp/B09BVXT8TJ,SHOKZ OpenRun Pro - Open-Ear Bluetooth Bone Co...
1,https://www.amazon.com/dp/B094242FV1,SAMSUNG Galaxy Buds 2 True Wireless Earbuds No...
2,https://www.amazon.com/dp/B09LYF2ST7,"BERIBES Bluetooth Headphones Over Ear, 65H Pla..."
3,https://www.amazon.com/dp/B08FCGH2RL,Skullcandy Crusher Evo Wireless Over-Ear Bluet...
4,https://www.amazon.com/dp/B000AJIF4E,Sony MDR7506 Professional Large Diaphragm Head...


## Preprocessing

In [5]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer model, stop-word list, WordNet.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def preprocess_text(text):
    """Normalize raw product text into a lowercase, lemmatized token string.

    Steps, in order: tokenize with NLTK's ``word_tokenize``, lowercase,
    strip ``string.punctuation`` characters from each token, drop English
    stop words, and lemmatize with ``WordNetLemmatizer``.

    Args:
        text: Raw text to clean. ``None`` or an empty string yields ``''``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not text:
        return ''

    # Build the helpers once per call rather than once per token.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for token in word_tokenize(text):
        token = token.lower().translate(punctuation_table)
        # A token made only of punctuation collapses to ''; skip it so the
        # joined result does not contain doubled spaces.
        if not token or token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
import pandas as pd

# Clean every product text in one pass. Iterating the column directly avoids
# iterrows(), which builds a full Series object for each row just to read
# a single field.
preprocessed_products = [preprocess_text(product) for product in clothing_data['product']]

# Add the preprocessed text to a new column in the clothing_data
clothing_data['preprocessed_product'] = preprocessed_products


In [7]:
# Preview the first five rows, now including the cleaned text column.
clothing_data.head(n=5)

Unnamed: 0,url,product,preprocessed_product
0,https://www.amazon.com/dp/B09BVXT8TJ,SHOKZ OpenRun Pro - Open-Ear Bluetooth Bone Co...,shokz openrun pro openear bluetooth bone cond...
1,https://www.amazon.com/dp/B094242FV1,SAMSUNG Galaxy Buds 2 True Wireless Earbuds No...,samsung galaxy bud 2 true wireless earbuds noi...
2,https://www.amazon.com/dp/B09LYF2ST7,"BERIBES Bluetooth Headphones Over Ear, 65H Pla...",beribes bluetooth headphone ear 65h playtime ...
3,https://www.amazon.com/dp/B08FCGH2RL,Skullcandy Crusher Evo Wireless Over-Ear Bluet...,skullcandy crusher evo wireless overear blueto...
4,https://www.amazon.com/dp/B000AJIF4E,Sony MDR7506 Professional Large Diaphragm Head...,sony mdr7506 professional large diaphragm head...


In [8]:
# Persist the preprocessed frame as CSV; 'utf-8-sig' writes UTF-8 with a BOM.
clothing_data.to_csv(path_or_buf='clothing_data_preprocessed.csv', encoding='utf-8-sig')

In [9]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [10]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# encode() is documented to take a str or a list of str. Hand it a plain list
# rather than a pandas Series, because integer indexing on a Series is a label
# lookup and is only equivalent to positional access for a default RangeIndex.
embeddings = model.encode(clothing_data['preprocessed_product'].tolist(), convert_to_tensor=True)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Save the embeddings for the whole dataset to an .npy file

In [12]:
import numpy as np

# NumPy can only read host memory, so copy the tensor to the CPU first and
# then expose it as an ndarray.
embeddings_cpu = embeddings.cpu()
embeddings_array = embeddings_cpu.numpy()

# np.save appends ".npy" when the name lacks it, so this writes "embeddingsnpy.npy".
np.save(file='embeddingsnpy', arr=embeddings_array)

## Similarity Search

In [13]:
# Embed the free-text query with the same model used for the catalogue.
# NOTE(review): the catalogue embeddings were built from preprocess_text()
# output while the query is embedded raw -- confirm this asymmetry is intended.
query = "Men's jeans black and comfortable"
query_embedding = model.encode([query], convert_to_tensor=True)

# Cosine similarity of the query against every product embedding; [0] selects
# the row belonging to the single query.
similarity_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
sorted_indices = similarity_scores.argsort(descending=True)

top_k = 5
similar_indices = sorted_indices[:top_k].cpu().numpy()

# These are positions into `embeddings`, so rows must be selected by position
# (iloc). Label-based .loc only gives the same rows for a default RangeIndex.
similar_urls = clothing_data['url'].iloc[similar_indices]

In [14]:
# Display the URLs of the top_k products ranked most similar to the query.
similar_urls

2495    https://www.amazon.in/dp/B0BWNB6D36
2428    https://www.amazon.in/dp/B0C24BH9ZF
2302    https://www.amazon.in/dp/B09B4CFP89
2362    https://www.amazon.in/dp/B0BFF5RMPX
2264    https://www.amazon.in/dp/B08KWFRY6W
Name: url, dtype: object