{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "import csv"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "# Path to the product-metadata CSV, relative to the notebook's working directory\n",
 "file_path = 'styles.csv'\n",
 "\n",
 "headers = []\n",
 "data = []\n",
 "skipped_rows = 0\n",
 "\n",
 "# Parse with the csv module so that rows with an unexpected number of fields\n",
 "# can be dropped one at a time instead of aborting the whole load.\n",
 "with open(file_path, mode='r', newline='', encoding='utf-8') as file:\n",
 "    csv_reader = csv.reader(file)\n",
 "\n",
 "    # The first row is the header; the default keeps an empty file from raising StopIteration.\n",
 "    headers = next(csv_reader, [])\n",
 "\n",
 "    for row in csv_reader:\n",
 "        if len(row) == len(headers):\n",
 "            data.append(row)\n",
 "        else:\n",
 "            # Wrong number of fields: skip the row, but count it so the loss is visible.\n",
 "            skipped_rows += 1\n",
 "\n",
 "print(f\"Loaded {len(data)} rows from {file_path}; skipped {skipped_rows} malformed rows\")"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgendermasterCategorysubCategoryarticleTypebaseColourseasonyearusageproductDisplayName
015970MenApparelTopwearShirtsNavy BlueFall2011CasualTurtle Check Men Navy Blue Shirt
139386MenApparelBottomwearJeansBlueSummer2012CasualPeter England Men Party Blue Jeans
259263WomenAccessoriesWatchesWatchesSilverWinter2016CasualTitan Women Silver Watch
321379MenApparelBottomwearTrack PantsBlackFall2011CasualManchester United Men Solid Black Track Pants
453759MenApparelTopwearTshirtsGreySummer2012CasualPuma Men Grey T-shirt
.................................
4441917036MenFootwearShoesCasual ShoesWhiteSummer2013CasualGas Men Caddy Casual Shoe
444206461MenFootwearFlip FlopsFlip FlopsRedSummer2011CasualLotto Men's Soccer Track Flip Flop
4442118842MenApparelTopwearTshirtsBlueFall2011CasualPuma Men Graphic Stellar Blue Tshirt
4442246694WomenPersonal CareFragrancePerfume and Body MistBlueSpring2017CasualRasasi Women Blue Lady Perfume
4442351623WomenAccessoriesWatchesWatchesPinkWinter2016CasualFossil Women Pink Dial Chronograph Watch ES3050
\n", "

44424 rows × 10 columns

\n", "
" ], "text/plain": [ " id gender masterCategory subCategory articleType \\\n", "0 15970 Men Apparel Topwear Shirts \n", "1 39386 Men Apparel Bottomwear Jeans \n", "2 59263 Women Accessories Watches Watches \n", "3 21379 Men Apparel Bottomwear Track Pants \n", "4 53759 Men Apparel Topwear Tshirts \n", "... ... ... ... ... ... \n", "44419 17036 Men Footwear Shoes Casual Shoes \n", "44420 6461 Men Footwear Flip Flops Flip Flops \n", "44421 18842 Men Apparel Topwear Tshirts \n", "44422 46694 Women Personal Care Fragrance Perfume and Body Mist \n", "44423 51623 Women Accessories Watches Watches \n", "\n", " baseColour season year usage \\\n", "0 Navy Blue Fall 2011 Casual \n", "1 Blue Summer 2012 Casual \n", "2 Silver Winter 2016 Casual \n", "3 Black Fall 2011 Casual \n", "4 Grey Summer 2012 Casual \n", "... ... ... ... ... \n", "44419 White Summer 2013 Casual \n", "44420 Red Summer 2011 Casual \n", "44421 Blue Fall 2011 Casual \n", "44422 Blue Spring 2017 Casual \n", "44423 Pink Winter 2016 Casual \n", "\n", " productDisplayName \n", "0 Turtle Check Men Navy Blue Shirt \n", "1 Peter England Men Party Blue Jeans \n", "2 Titan Women Silver Watch \n", "3 Manchester United Men Solid Black Track Pants \n", "4 Puma Men Grey T-shirt \n", "... ... \n", "44419 Gas Men Caddy Casual Shoe \n", "44420 Lotto Men's Soccer Track Flip Flop \n", "44421 Puma Men Graphic Stellar Blue Tshirt \n", "44422 Rasasi Women Blue Lady Perfume \n", "44423 Fossil Women Pink Dial Chronograph Watch ES3050 \n", "\n", "[44424 rows x 10 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Build the styles table from the parsed CSV rows; column names come from the header row.\n",
 "styles_df = pd.DataFrame(data=data, columns=headers)\n",
 "styles_df"
 ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenamelink
015970.jpghttp://assets.myntassets.com/v1/images/style/p...
139386.jpghttp://assets.myntassets.com/v1/images/style/p...
259263.jpghttp://assets.myntassets.com/v1/images/style/p...
321379.jpghttp://assets.myntassets.com/v1/images/style/p...
453759.jpghttp://assets.myntassets.com/v1/images/style/p...
.........
4444117036.jpghttp://assets.myntassets.com/v1/images/style/p...
444426461.jpghttp://assets.myntassets.com/v1/images/style/p...
4444318842.jpghttp://assets.myntassets.com/v1/images/style/p...
4444446694.jpghttp://assets.myntassets.com/v1/images/style/p...
4444551623.jpghttp://assets.myntassets.com/assets/images/516...
\n", "

44446 rows × 2 columns

\n", "
" ], "text/plain": [ " filename link\n", "0 15970.jpg http://assets.myntassets.com/v1/images/style/p...\n", "1 39386.jpg http://assets.myntassets.com/v1/images/style/p...\n", "2 59263.jpg http://assets.myntassets.com/v1/images/style/p...\n", "3 21379.jpg http://assets.myntassets.com/v1/images/style/p...\n", "4 53759.jpg http://assets.myntassets.com/v1/images/style/p...\n", "... ... ...\n", "44441 17036.jpg http://assets.myntassets.com/v1/images/style/p...\n", "44442 6461.jpg http://assets.myntassets.com/v1/images/style/p...\n", "44443 18842.jpg http://assets.myntassets.com/v1/images/style/p...\n", "44444 46694.jpg http://assets.myntassets.com/v1/images/style/p...\n", "44445 51623.jpg http://assets.myntassets.com/assets/images/516...\n", "\n", "[44446 rows x 2 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Image table: one row per image file name with its download link\n",
 "images_df = pd.read_csv(\"images.csv\")\n",
 "images_df"
 ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "# Derive the join key: image files are named '<id>.jpg' (ids are strings here, so '+' concatenates)\n",
 "styles_df['filename'] = styles_df['id'] + \".jpg\""
 ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
 "# Attach each product's image link; the inner join keeps only products that have an image row.\n",
 "main_df = pd.merge(styles_df, images_df, how=\"inner\", on='filename')"
 ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
 "# 'filename' was only needed as the join key. Reassigning with errors='ignore'\n",
 "# (instead of inplace=True) lets this cell be re-run without raising KeyError.\n",
 "main_df = main_df.drop(columns=['filename'], errors='ignore')"
 ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "from langchain.docstore.document import Document"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'gender', 'masterCategory', 'subCategory', 'articleType',\n", " 'baseColour', 'season', 'year', 'usage', 'productDisplayName', 'link'],\n", " dtype='object')" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "main_df.columns"
 ] }, { "cell_type": "code", "execution_count": 
30, "metadata": {}, "outputs": [], "source": [
 "def make_text(x):\n",
 "    \"\"\"Build the text that gets embedded for one product row.\n",
 "\n",
 "    Args:\n",
 "        x: One product row (anything indexable by the column names used below).\n",
 "\n",
 "    Returns:\n",
 "        A (text, id) tuple: the description string and the row's 'id' value.\n",
 "    \"\"\"\n",
 "    # NOTE: the line breaks and leading whitespace inside the f-string are part of\n",
 "    # the returned text, so reformatting it changes what is sent to the embedding model.\n",
 "    return (f\"\"\"\n",
 " Gender is {x['gender']} and \n",
 " masterCategory is {x['masterCategory']} and \n",
 " subCategory is {x['subCategory']} and \n",
 " articleType is {x['articleType']} and \n",
 " baseColour is {x['baseColour']} and \n",
 " season is {x['season']} and \n",
 " year is {x['year']} and \n",
 " usage is {x['usage']} and \n",
 " productDisplayName is {x['productDisplayName']}\n",
 " \"\"\" , x['id'])"
 ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [
 "# Series of (text, id) tuples, one per product row\n",
 "docs_content = main_df.apply(make_text, axis=1)"
 ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [
 "# One Document per product; the product id travels in the metadata so the stored\n",
 "# embedding can be matched back to its row. 'product_id' avoids shadowing the builtin id().\n",
 "docs = [\n",
 "    Document(page_content=text, metadata={'id': product_id})\n",
 "    for text, product_id in docs_content\n",
 "]"
 ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [
 "import KEYS\n",
 "from langchain.vectorstores import Chroma\n",
 "from langchain_google_genai import GoogleGenerativeAIEmbeddings"
 ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [
 "gemini_embeddings = GoogleGenerativeAIEmbeddings(model=\"models/text-embedding-004\",google_api_key=KEYS.api_key.GOOGLE_API_KEY)"
 ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [
 "# Save to disk\n",
 "# Passing the product ids as the store's ids makes a re-run of this cell overwrite the\n",
 "# existing entries; without ids, every run adds a fresh copy of each document to ./chroma_db.\n",
 "vectorstore = Chroma.from_documents(\n",
 "    documents=docs, # Data\n",
 "    embedding=gemini_embeddings, # Embedding model\n",
 "    ids=[doc.metadata['id'] for doc in docs], # Stable ids -> idempotent re-runs\n",
 "    persist_directory=\"./chroma_db\" # Directory to save data\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [
 "import chromadb"
 ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [
 "client = chromadb.PersistentClient(path=\"./chroma_db\")"
 ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "collection = 
client.list_collections()[0]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [
 "# Read the stored records back. Only metadata and embeddings are needed here, and a\n",
 "# dedicated name keeps the CSV rows list called 'data' from being overwritten.\n",
 "chroma_records = collection.get(include=['metadatas', 'embeddings'])\n",
 "\n",
 "# [product id, embedding vector] pairs, in the order the collection returned them\n",
 "embedding_data = [\n",
 "    [metadata['id'], embedding]\n",
 "    for metadata, embedding in zip(chroma_records['metadatas'], chroma_records['embeddings'])\n",
 "]"
 ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [
 "embeds_df = pd.DataFrame(embedding_data,columns=['id','embedding'])"
 ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [
 "# Attach each product's embedding to its metadata row; rows without a match on 'id' are dropped.\n",
 "final_df = pd.merge(main_df, embeds_df, how=\"inner\", on='id')"
 ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [
 "# Write final_df as n_steps Feather shards named data_0.feather ... data_{n_steps-1}.feather\n",
 "n_steps = 10\n",
 "max_index = final_df.shape[0]\n",
 "# '+ 1' rounds the shard size up so n_steps shards always cover every row\n",
 "step_size = max_index // n_steps + 1\n",
 "for i in range(n_steps):\n",
 "    start = i * step_size\n",
 "    stop = min(max_index, start + step_size)\n",
 "    # reset_index() gives each shard a fresh 0..n-1 index and keeps the original row\n",
 "    # label in an 'index' column\n",
 "    final_df.iloc[start:stop].reset_index().to_feather(f\"data_{i}.feather\")"
 ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexidgendermasterCategorysubCategoryarticleTypebaseColourseasonyearusageproductDisplayNamelinkembedding
0015970MenApparelTopwearShirtsNavy BlueFall2011CasualTurtle Check Men Navy Blue Shirthttp://assets.myntassets.com/v1/images/style/p...[-0.04959992691874504, 0.030256308615207672, 0...
1139386MenApparelBottomwearJeansBlueSummer2012CasualPeter England Men Party Blue Jeanshttp://assets.myntassets.com/v1/images/style/p...[-0.04374004527926445, 0.0014770406996831298, ...
2259263WomenAccessoriesWatchesWatchesSilverWinter2016CasualTitan Women Silver Watchhttp://assets.myntassets.com/v1/images/style/p...[-0.017907867208123207, -0.008326959796249866,...
3321379MenApparelBottomwearTrack PantsBlackFall2011CasualManchester United Men Solid Black Track Pantshttp://assets.myntassets.com/v1/images/style/p...[-0.06801198422908783, 0.011990022845566273, 0...
4453759MenApparelTopwearTshirtsGreySummer2012CasualPuma Men Grey T-shirthttp://assets.myntassets.com/v1/images/style/p...[-0.08272361010313034, 0.017822109162807465, 0...
..........................................
44324441917036MenFootwearShoesCasual ShoesWhiteSummer2013CasualGas Men Caddy Casual Shoehttp://assets.myntassets.com/v1/images/style/p...[-0.062232204, -0.011351791, -0.0027062385, 0....
4433444206461MenFootwearFlip FlopsFlip FlopsRedSummer2011CasualLotto Men's Soccer Track Flip Flophttp://assets.myntassets.com/v1/images/style/p...[-0.06830623, 0.023115562, 0.028224507, 0.0283...
44344442118842MenApparelTopwearTshirtsBlueFall2011CasualPuma Men Graphic Stellar Blue Tshirthttp://assets.myntassets.com/v1/images/style/p...[-0.058011707, 0.022569647, 0.0293498, 0.00073...
44354442246694WomenPersonal CareFragrancePerfume and Body MistBlueSpring2017CasualRasasi Women Blue Lady Perfumehttp://assets.myntassets.com/v1/images/style/p...[-0.0127937365, 0.002269573, 0.016845196, -0.0...
44364442351623WomenAccessoriesWatchesWatchesPinkWinter2016CasualFossil Women Pink Dial Chronograph Watch ES3050http://assets.myntassets.com/assets/images/516...[-0.007802535, -0.030409014, -0.01775647, -0.0...
\n", "

44424 rows × 13 columns

\n", "
" ], "text/plain": [ " index id gender masterCategory subCategory articleType \\\n", "0 0 15970 Men Apparel Topwear Shirts \n", "1 1 39386 Men Apparel Bottomwear Jeans \n", "2 2 59263 Women Accessories Watches Watches \n", "3 3 21379 Men Apparel Bottomwear Track Pants \n", "4 4 53759 Men Apparel Topwear Tshirts \n", "... ... ... ... ... ... ... \n", "4432 44419 17036 Men Footwear Shoes Casual Shoes \n", "4433 44420 6461 Men Footwear Flip Flops Flip Flops \n", "4434 44421 18842 Men Apparel Topwear Tshirts \n", "4435 44422 46694 Women Personal Care Fragrance Perfume and Body Mist \n", "4436 44423 51623 Women Accessories Watches Watches \n", "\n", " baseColour season year usage \\\n", "0 Navy Blue Fall 2011 Casual \n", "1 Blue Summer 2012 Casual \n", "2 Silver Winter 2016 Casual \n", "3 Black Fall 2011 Casual \n", "4 Grey Summer 2012 Casual \n", "... ... ... ... ... \n", "4432 White Summer 2013 Casual \n", "4433 Red Summer 2011 Casual \n", "4434 Blue Fall 2011 Casual \n", "4435 Blue Spring 2017 Casual \n", "4436 Pink Winter 2016 Casual \n", "\n", " productDisplayName \\\n", "0 Turtle Check Men Navy Blue Shirt \n", "1 Peter England Men Party Blue Jeans \n", "2 Titan Women Silver Watch \n", "3 Manchester United Men Solid Black Track Pants \n", "4 Puma Men Grey T-shirt \n", "... ... \n", "4432 Gas Men Caddy Casual Shoe \n", "4433 Lotto Men's Soccer Track Flip Flop \n", "4434 Puma Men Graphic Stellar Blue Tshirt \n", "4435 Rasasi Women Blue Lady Perfume \n", "4436 Fossil Women Pink Dial Chronograph Watch ES3050 \n", "\n", " link \\\n", "0 http://assets.myntassets.com/v1/images/style/p... \n", "1 http://assets.myntassets.com/v1/images/style/p... \n", "2 http://assets.myntassets.com/v1/images/style/p... \n", "3 http://assets.myntassets.com/v1/images/style/p... \n", "4 http://assets.myntassets.com/v1/images/style/p... \n", "... ... \n", "4432 http://assets.myntassets.com/v1/images/style/p... \n", "4433 http://assets.myntassets.com/v1/images/style/p... 
\n", "4434 http://assets.myntassets.com/v1/images/style/p... \n", "4435 http://assets.myntassets.com/v1/images/style/p... \n", "4436 http://assets.myntassets.com/assets/images/516... \n", "\n", " embedding \n", "0 [-0.04959992691874504, 0.030256308615207672, 0... \n", "1 [-0.04374004527926445, 0.0014770406996831298, ... \n", "2 [-0.017907867208123207, -0.008326959796249866,... \n", "3 [-0.06801198422908783, 0.011990022845566273, 0... \n", "4 [-0.08272361010313034, 0.017822109162807465, 0... \n", "... ... \n", "4432 [-0.062232204, -0.011351791, -0.0027062385, 0.... \n", "4433 [-0.06830623, 0.023115562, 0.028224507, 0.0283... \n", "4434 [-0.058011707, 0.022569647, 0.0293498, 0.00073... \n", "4435 [-0.0127937365, 0.002269573, 0.016845196, -0.0... \n", "4436 [-0.007802535, -0.030409014, -0.01775647, -0.0... \n", "\n", "[44424 rows x 13 columns]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Reload the Feather shards into one frame. A single concat over the whole list avoids\n",
 "# re-copying the growing frame on every iteration; each shard keeps its own row index.\n",
 "df = pd.concat([pd.read_feather(f\"data_{i}.feather\") for i in range(n_steps)])"
 ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [
 "import google.generativeai as genai\n",
 "import numpy as np\n",
 "genai.configure(api_key=KEYS.api_key.GOOGLE_API_KEY)"
 ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [
 "def get_results(top_n = 6,query = \"men shirt\"):\n",
 "    \"\"\"Return the products in final_df that score highest against a text query.\n",
 "\n",
 "    Args:\n",
 "        top_n: Number of products to return.\n",
 "        query: Free-text search query; embedded with models/text-embedding-004.\n",
 "\n",
 "    Returns:\n",
 "        Object array with one [productDisplayName, link] row per match, best score first.\n",
 "    \"\"\"\n",
 "    # Convert the returned embedding to an array once, rather than on every row's np.dot call\n",
 "    query_embedding = np.asarray(\n",
 "        genai.embed_content(model=\"models/text-embedding-004\",\n",
 "                            content=query,\n",
 "                            task_type=\"retrieval_query\")['embedding']\n",
 "    )\n",
 "    # Score = dot product between each stored product embedding and the query embedding\n",
 "    scores = final_df['embedding'].apply(lambda x: np.dot(x, query_embedding))\n",
 "    # nlargest returns the top_n scores in descending order without sorting every row\n",
 "    top_scores = scores.nlargest(top_n)\n",
 "    return final_df.loc[top_scores.index, ['productDisplayName', 'link']].to_numpy()"
 ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([['French Connection Men Black Shirt',\n", " 
'http://assets.myntassets.com/v1/images/style/properties/9a6ead385ff56471fc6f376da807c617_images.jpg'],\n", " ['Flying Machine Men Check Green Shirts',\n", " 'http://assets.myntassets.com/v1/images/style/properties/3f4c47753110a3f6c093e6bc3e4123df_images.jpg'],\n", " ['Peter England Men Multi Coloured Casual Shirt',\n", " 'http://assets.myntassets.com/v1/images/style/properties/19d194298ca2009e5cc6f0184f2a0f7d_images.jpg'],\n", " ['Flying Machine Men Printed White Shirt',\n", " 'http://assets.myntassets.com/v1/images/style/properties/02fc0c611bca206ed4cd2e0bd17bfd8d_images.jpg'],\n", " ['Flying Machine Men Check Blue Shirts',\n", " 'http://assets.myntassets.com/v1/images/style/properties/c2984f58de4e666b5a3fe6a3b8e36f23_images.jpg'],\n", " ['Flying Machine Men Check Blue Shirts',\n", " 'http://assets.myntassets.com/v1/images/style/properties/eaca895bbca83fbc080ce26ced471a0d_images.jpg']],\n", " dtype=object)" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Smoke-test the search; the arguments are the function's defaults, spelled out\n",
 "get_results(top_n=6, query=\"men shirt\")"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 2 }