{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "small-delaware", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.\n", "\n", "We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.\n", "\n", "mediapipe 0.8.1 requires opencv-python, which is not installed.\n", "djitellopy 2.4.0 requires opencv-python, which is not installed.\n", "umojahack2023 0.1 requires numpy==1.23.5, but you'll have numpy 1.24.4 which is incompatible.\n", "tensorflow 2.11.0 requires gast<=0.4.0,>=0.2.1, but you'll have gast 0.5.3 which is incompatible.\n", "streamlit 1.3.1 requires click<8.0,>=7.0, but you'll have click 8.1.3 which is incompatible.\n", "pandas-profiling 3.2.0 requires joblib~=1.1.0, but you'll have joblib 1.2.0 which is incompatible.\n", "pandas-profiling 3.2.0 requires markupsafe~=2.1.1, but you'll have markupsafe 2.0.1 which is incompatible.\n", "mediapipe 0.8.1 requires numpy==1.19.3, but you'll have numpy 1.24.4 which is incompatible.\n", "huggingface-hub 0.14.1 requires packaging>=20.9, but you'll have packaging 20.8 which is incompatible.\n", "google-api-core 2.11.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you'll have protobuf 3.19.3 which is incompatible.\u001b[0m\n" ] } ], "source": [ "!pip install -qU pinecone-client \\\n", " tqdm \\\n", " httpimport \\\n", " requests" ] }, { "cell_type": "code", "execution_count": 2, "id": "legal-course", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/paul/miniconda3/lib/python3.8/site-packages/pinecone/index.py:4: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. 
Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm\n" ] } ], "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import httpimport\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pinecone\n",
    "import requests\n",
    "import tqdm\n",
    "from PIL import Image\n",
    "\n",
    "DATA_DIRECTORY = 'tmp'\n",
    "INDEX_NAME = 'jumia-product-search'\n",
    "INDEX_DIMENSION = 512\n",
    "BATCH_SIZE = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "political-robertson",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Credentials are read from the environment so they are never saved in the notebook.\n",
    "# The API key that used to be hardcoded in this cell was committed and should be rotated.\n",
    "PINECONE_API_KEY = os.environ[\"PINECONE_API_KEY\"]\n",
    "PINECONE_ENV = os.environ.get(\"PINECONE_ENV\", \"us-west1-gcp-free\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "contained-financing",
   "metadata": {},
   "outputs": [],
   "source": [
    "pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "biblical-decision",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dimension': 512,\n",
       " 'index_fullness': 0.0,\n",
       " 'namespaces': {},\n",
       " 'total_vector_count': 0}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): this name differs from the INDEX_NAME constant defined with the imports - confirm which one is intended.\n",
    "index_name = 'jumia-product-embeddings'\n",
    "\n",
    "if index_name not in pinecone.list_indexes():\n",
    "    # now create the new index\n",
    "    pinecone.create_index(\n",
    "        index_name,\n",
    "        dimension=INDEX_DIMENSION,  # 512 (original inline note: 768)\n",
    "        metric='cosine',\n",
    "        pod_type='s1',\n",
    "        pods=1\n",
    "    )\n",
    "\n",
    "# connect to index\n",
    "index = pinecone.Index(index_name)\n",
    "# then check index status\n",
    "index.describe_index_stats()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "advance-composite",
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBED_FILE = \"../image_search_engine/artifacts/embeddings/embed_2023-07-09_15-17-45.pkl\"\n",
    "\n",
    "\n",
    "def load_serialized_object(file_path):\n",
    "    \"\"\"Load an object serialized with joblib from ``file_path``.\n",
    "\n",
    "    Returns the deserialized object, or ``None`` (after printing a message)\n",
    "    when the file is missing or cannot be loaded.\n",
    "\n",
    "    joblib.load unpickles its input, which can execute arbitrary code:\n",
    "    only call this on files from a trusted source.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return joblib.load(file_path)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"File not found: {file_path}\")\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading serialized object: {str(e)}\")\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "lasting-wyoming",
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = load_serialized_object(EMBED_FILE)\n",
    "# load_serialized_object returns None on failure; stop here instead of failing later with an obscure error.\n",
    "assert embeddings is not None, f\"Could not load embeddings from {EMBED_FILE}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "dramatic-superintendent",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert each embedding via .tolist(); items without that method (already converted) are kept as-is so the cell can be re-run.\n",
    "embeddings = [embed.tolist() if hasattr(embed, \"tolist\") else embed for embed in embeddings]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "formal-million",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2941"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "equipped-conversion",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_json_file(file_path):\n",
    "    \"\"\"Read the JSON document at ``file_path`` and return the parsed data.\"\"\"\n",
    "    # JSON is read as UTF-8 explicitly rather than with the platform default encoding.\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
    "        return json.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "veterinary-greenhouse",
   "metadata": {},
   "outputs": [],
   "source": [
    "json_data = load_json_file(\"processed/jumia_3650/jumia_3650.json\")"
   ]
  },
  { "cell_type": "code", "execution_count": 19, "id": "important-literacy", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8b01efeef77e42268c86219a3dab6a5a", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/5 [00:00