{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%pip install sentence-transformers==2.0.0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Load dataset with pandas" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DescriptionUnitPriceCountry
0WHITE HANGING HEART T-LIGHT HOLDER2.55United Kingdom
1WHITE METAL LANTERN3.39United Kingdom
2CREAM CUPID HEARTS COAT HANGER2.75United Kingdom
3KNITTED UNION FLAG HOT WATER BOTTLE3.39United Kingdom
4RED WOOLLY HOTTIE WHITE HEART.3.39United Kingdom
............
535327????damages????0.00United Kingdom
535329mixed up0.00United Kingdom
535335lost0.00United Kingdom
537621CREAM HANGING HEART T-LIGHT HOLDER2.95United Kingdom
540421PAPER CRAFT , LITTLE BIRDIE2.08United Kingdom
\n", "

4223 rows × 3 columns

\n", "
" ], "text/plain": [ " Description UnitPrice Country\n", "0 WHITE HANGING HEART T-LIGHT HOLDER 2.55 United Kingdom\n", "1 WHITE METAL LANTERN 3.39 United Kingdom\n", "2 CREAM CUPID HEARTS COAT HANGER 2.75 United Kingdom\n", "3 KNITTED UNION FLAG HOT WATER BOTTLE 3.39 United Kingdom\n", "4 RED WOOLLY HOTTIE WHITE HEART. 3.39 United Kingdom\n", "... ... ... ...\n", "535327 ????damages???? 0.00 United Kingdom\n", "535329 mixed up 0.00 United Kingdom\n", "535335 lost 0.00 United Kingdom\n", "537621 CREAM HANGING HEART T-LIGHT HOLDER 2.95 United Kingdom\n", "540421 PAPER CRAFT , LITTLE BIRDIE 2.08 United Kingdom\n", "\n", "[4223 rows x 3 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "# Keep only the three columns used downstream, drop rows with a missing value in any of them,\n", "# and keep one row per distinct Description (drop_duplicates keeps the first occurrence).\n", "# The original row labels are kept (no reset_index), hence the gaps in the displayed index.\n", "df = pd.read_csv('products.csv')\n", "df = df[['Description', 'UnitPrice', 'Country']]\n", "df = df.dropna().drop_duplicates(subset=['Description'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "2. Encode every product description into a vector (1 column with product text, 1 column with vectors)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from tqdm import tqdm\n", "from sentence_transformers import SentenceTransformer\n", "tqdm.pandas()\n", "\n", "model = SentenceTransformer('all-mpnet-base-v2')  # alternative checkpoint: 'all-MiniLM-L6-v2'\n", "\n", "# Encode all descriptions in one batched call instead of one model.encode() call per row:\n", "# encode() accepts a list of texts, processes batch_size of them at a time and shows its own progress bar.\n", "# Everything is encoded and held in memory at once, so this version is for small datasets only.\n", "descriptions = df['Description'].tolist()\n", "embeddings = model.encode(descriptions, batch_size=32, show_progress_bar=True)\n", "\n", "# One plain Python list of floats per row, in the same row order as df.\n", "df['text_vector_'] = embeddings.tolist()\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# index=False: do not write the DataFrame index. index=None is pandas' default and, because the\n", "# index is no longer a RangeIndex after the dropna/drop_duplicates filtering, it would store the\n", "# leftover row labels in the file as well.\n", "df.to_parquet('df_encoded.parquet', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.0 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "fdf377d643bc1cb065454f0ad2ceac75d834452ecf289e7ba92c6b3f59a7cee1" } } }, "nbformat": 4, "nbformat_minor": 2 }