{ "cells": [ { "cell_type": "code", "execution_count": 7, "id": "b60885fa-9b72-494f-80ca-976b0ab2d897", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Show up to 100 characters per cell so the text column is readable in outputs.\n", "pd.set_option('display.max_colwidth', 100)" ] },
{ "cell_type": "code", "execution_count": 8, "id": "3d42b739-8c64-46e2-b0d6-cb8a4fe05e08", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8, 2)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the corpus to index; the path is resolved relative to the working directory.\n", "df = pd.read_csv('sample_text.csv')\n", "df.shape" ] },
{ "cell_type": "code", "execution_count": 9, "id": "c26485ef-4276-4c77-bae3-292b065d0ba5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/anaconda3/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "(8, 768)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sentence_transformers import SentenceTransformer\n", "\n", "encoder = SentenceTransformer(\"all-mpnet-base-v2\")\n", "# Pass a plain list of strings (the input type encode() documents) so the call\n", "# does not depend on how the DataFrame happens to be indexed.\n", "vectors = encoder.encode(df[\"text\"].tolist())\n", "vectors.shape" ] },
{ "cell_type": "code", "execution_count": 11, "id": "4f1d88ea-bdb1-4066-94c6-b4d0e6eee442", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "768" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Embedding width; used below to size the FAISS index.\n", "dim = vectors.shape[1]\n", "dim" ] },
{ "cell_type": "code", "execution_count": 12, "id": "d9dd2469-59b1-4c28-882f-790685abcb3a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " >" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import faiss\n", "\n", "# Flat index: stores the raw vectors and answers queries by exact, exhaustive\n", "# L2 comparison. It has to be created with the embedding dimension.\n", "index = faiss.IndexFlatL2(dim)\n", "\n", "index\n" ] },
{ "cell_type": "code", "execution_count": 13, "id": "e43fe842-0b3b-41e7-b562-a8371bafcd40", "metadata": {}, "outputs": [], "source": [ "# Ids are assigned implicitly in insertion order, so id i is the i-th row of df.\n", "index.add(vectors)" ] },
{ "cell_type": "code", "execution_count": 29, "id": "eb8ca9ac-4bbb-47f8-b99b-46fc8a6831f2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(768,)" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "search_query = \"finance\"\n", "\n", "vec = encoder.encode(search_query)\n", "vec.shape" ] },
{ "cell_type": "code", "execution_count": 30, "id": "99e18af2-0958-4d94-bc94-77812031fcf4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1, 768)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "# index.search() takes a 2-D batch of queries, so lift the single vector to shape (1, dim).\n", "svec = np.array(vec).reshape(1, -1)\n", "svec.shape" ] },
{ "cell_type": "code", "execution_count": 31, "id": "c8d0671e-81d9-4cd3-bbe9-3809b8dc366c", "metadata": {}, "outputs": [], "source": [ "# Two nearest neighbours: `distances` holds the (squared) L2 distances, `I` the matching ids.\n", "distances, I = index.search(svec, k=2)" ] },
{ "cell_type": "code", "execution_count": 32, "id": "ac60d14e-3e3e-4c7c-b548-d739859a6fe8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[5, 2]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "I" ] },
{ "cell_type": "code", "execution_count": 33, "id": "7e3cd9f0-f7b8-4d36-88ba-afe2d293c31b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcategory
5Navaratri dandiya program at Expo center in Mumbai this octoberEvent
2These are the latest fashion trends for this weekFashion
\n", "
" ], "text/plain": [ " text category\n", "5 Navaratri dandiya program at Expo center in Mumbai this october Event\n", "2 These are the latest fashion trends for this week Fashion" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# FAISS ids are 0-based insertion positions, not DataFrame labels, so select\n", "# the matching rows by position; this stays correct if df is ever re-indexed.\n", "df.iloc[I[0]]" ] }, { "cell_type": "code", "execution_count": 34, "id": "e387b515-7213-4116-97ae-3c4bbcb4a846", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcategory
0Meditation and yoga can improve mental healthHealth
1Fruits, whole grains and vegetables helps control blood pressureHealth
2These are the latest fashion trends for this weekFashion
3Vibrant color jeans for male are becoming a trendFashion
4The concert starts at 7 PM tonightEvent
5Navaratri dandiya program at Expo center in Mumbai this octoberEvent
6Exciting vacation destinations for your next tripTravel
7Maldives and Srilanka are gaining popularity in terms of low budget vacation placesTravel
\n", "
" ], "text/plain": [ " text \\\n", "0 Meditation and yoga can improve mental health \n", "1 Fruits, whole grains and vegetables helps control blood pressure \n", "2 These are the latest fashion trends for this week \n", "3 Vibrant color jeans for male are becoming a trend \n", "4 The concert starts at 7 PM tonight \n", "5 Navaratri dandiya program at Expo center in Mumbai this october \n", "6 Exciting vacation destinations for your next trip \n", "7 Maldives and Srilanka are gaining popularity in terms of low budget vacation places \n", "\n", " category \n", "0 Health \n", "1 Health \n", "2 Fashion \n", "3 Fashion \n", "4 Event \n", "5 Event \n", "6 Travel \n", "7 Travel " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Full corpus, displayed for comparison with the rows retrieved by the search.\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }