{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os; os.chdir('..')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "import pandas as pd\n", "from utils.get_category import predict, get_top_labels" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IndexKeyword
01guide to headphones
12headphone guide
23buy headphones guide
34choosing headphones guide
45sony headphones guide
.........
272273guidelines 2022
273274guidelines 2023
274275ts guidelines
275276guided drawing
276277guided meditation
\n", "

277 rows × 2 columns

\n", "
" ], "text/plain": [ " Index Keyword\n", "0 1 guide to headphones\n", "1 2 headphone guide\n", "2 3 buy headphones guide\n", "3 4 choosing headphones guide\n", "4 5 sony headphones guide\n", ".. ... ...\n", "272 273 guidelines 2022\n", "273 274 guidelines 2023\n", "274 275 ts guidelines\n", "275 276 guided drawing\n", "276 277 guided meditation\n", "\n", "[277 rows x 2 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df= pd.read_csv(\n", " 'data_test/keywords-2.csv'\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Predicted Class: Computers_and_Electronics \n", "probabilities_scores: 1.0\n", "\n" ] }, { "data": { "text/plain": [ "{'P(Hobbies_and_Leisure)': 0.034,\n", " 'P(News)': 0.005,\n", " 'P(Science)': 0.008,\n", " 'P(Autos_and_Vehicles)': 0.002,\n", " 'P(Health)': 0.009,\n", " 'P(Pets_and_Animals)': 0.003,\n", " 'P(Adult)': 0.017,\n", " 'P(Computers_and_Electronics)': 1.0,\n", " 'P(Online Communities)': 0.019,\n", " 'P(Beauty_and_Fitness)': 0.007,\n", " 'P(People_and_Society)': 0.0,\n", " 'P(Business_and_Industrial)': 0.001,\n", " 'P(Reference)': 0.007,\n", " 'P(Shopping)': 0.173,\n", " 'P(Travel_and_Transportation)': 0.001,\n", " 'P(Food_and_Drink)': 0.012,\n", " 'P(Law_and_Government)': 0.022,\n", " 'P(Books_and_Literature)': 0.001,\n", " 'P(Finance)': 0.01,\n", " 'P(Games)': 0.069,\n", " 'P(Home_and_Garden)': 0.011,\n", " 'P(Jobs_and_Education)': 0.001,\n", " 'P(Arts_and_Entertainment)': 0.005,\n", " 'P(Sensitive Subjects)': 0.003,\n", " 'P(Real Estate)': 0.009,\n", " 'P(Internet_and_Telecom)': 0.063,\n", " 'P(Sports)': 0.014,\n", " 'Predicted Label': 'Computers_and_Electronics',\n", " 'Predicted Label Score': 1.0}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict('cat ear headphones')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df['Category']= df.Keyword.map(get_top_labels)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "# for i in range(len(df.Category)):\n", "# # print(type(df.Category[i]))\n", "# # df.Category[i]= df.Category[i]['Predicted Label']\n", "# # df.loc[i, 'Category']= f\"{df.Category[i]['Predicted Label']}, {str(np.round(df.Category[i]['Predicted Label Score'],2))}\"\n", "# df.loc[i, 'Probablity Score']= f\"{str(np.round(df.Category[i]['Predicted Label Score'],2))}\"\n", "# df.loc[i, 'Category']= f\"{df.Category[i]['Predicted Label']}\"\n", " \n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IndexKeywordCategory
01guide to headphones[(Computers_and_Electronics, 1.0)]
12headphone guide[(Computers_and_Electronics, 0.999)]
23buy headphones guide[(Shopping, 0.997), (Computers_and_Electronics...
34choosing headphones guide[(Computers_and_Electronics, 1.0)]
45sony headphones guide[(Computers_and_Electronics, 1.0)]
............
272273guidelines 2022[(Computers_and_Electronics, 1.0)]
273274guidelines 2023[(Computers_and_Electronics, 1.0)]
274275ts guidelines[(Computers_and_Electronics, 0.933), (Games, 0...
275276guided drawing[(Reference, 0.995)]
276277guided meditation[(Beauty_and_Fitness, 1.0), (Hobbies_and_Leisu...
\n", "

277 rows × 3 columns

\n", "
" ], "text/plain": [ " Index Keyword \\\n", "0 1 guide to headphones \n", "1 2 headphone guide \n", "2 3 buy headphones guide \n", "3 4 choosing headphones guide \n", "4 5 sony headphones guide \n", ".. ... ... \n", "272 273 guidelines 2022 \n", "273 274 guidelines 2023 \n", "274 275 ts guidelines \n", "275 276 guided drawing \n", "276 277 guided meditation \n", "\n", " Category \n", "0 [(Computers_and_Electronics, 1.0)] \n", "1 [(Computers_and_Electronics, 0.999)] \n", "2 [(Shopping, 0.997), (Computers_and_Electronics... \n", "3 [(Computers_and_Electronics, 1.0)] \n", "4 [(Computers_and_Electronics, 1.0)] \n", ".. ... \n", "272 [(Computers_and_Electronics, 1.0)] \n", "273 [(Computers_and_Electronics, 1.0)] \n", "274 [(Computers_and_Electronics, 0.933), (Games, 0... \n", "275 [(Reference, 0.995)] \n", "276 [(Beauty_and_Fitness, 1.0), (Hobbies_and_Leisu... \n", "\n", "[277 rows x 3 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\n", " 'data_test/labelled_data.csv',\n", " index=False\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IndexKeywordCategoryProbablity Score
9394reddit headphones buying guideInternet_and_Telecom0.98
116117turtle beach headset guidePets_and_Animals0.99
120121earbuds buying guide redditOnline Communities0.91
208209guidesSports0.7
224225guide bookBooks_and_Literature0.86
225226guide for saleShopping1.0
226227guide for schoolJobs_and_Education0.84
227228guide for studentsOther0.57
248249guidebookBooks_and_Literature0.91
251252guide redditOnline Communities1.0
260261hr guide interview questionsJobs_and_Education1.0
262263vi beginners guideHobbies_and_Leisure0.9
265266guidelinesSensitive Subjects1.0
270271guided readingBooks_and_Literature1.0
271272guided reading levelBooks_and_Literature0.98
275276guided drawingOther0.56
276277guided meditationBeauty_and_Fitness1.0
\n", "
" ], "text/plain": [ " Index Keyword Category \\\n", "93 94 reddit headphones buying guide Internet_and_Telecom \n", "116 117 turtle beach headset guide Pets_and_Animals \n", "120 121 earbuds buying guide reddit Online Communities \n", "208 209 guides Sports \n", "224 225 guide book Books_and_Literature \n", "225 226 guide for sale Shopping \n", "226 227 guide for school Jobs_and_Education \n", "227 228 guide for students Other \n", "248 249 guidebook Books_and_Literature \n", "251 252 guide reddit Online Communities \n", "260 261 hr guide interview questions Jobs_and_Education \n", "262 263 vi beginners guide Hobbies_and_Leisure \n", "265 266 guidelines Sensitive Subjects \n", "270 271 guided reading Books_and_Literature \n", "271 272 guided reading level Books_and_Literature \n", "275 276 guided drawing Other \n", "276 277 guided meditation Beauty_and_Fitness \n", "\n", " Probablity Score \n", "93 0.98 \n", "116 0.99 \n", "120 0.91 \n", "208 0.7 \n", "224 0.86 \n", "225 1.0 \n", "226 0.84 \n", "227 0.57 \n", "248 0.91 \n", "251 1.0 \n", "260 1.0 \n", "262 0.9 \n", "265 1.0 \n", "270 1.0 \n", "271 0.98 \n", "275 0.56 \n", "276 1.0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df[df.Category!='Computers_and_Electronics']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IndexKeywordCategoryProbablity Score
227228guide for studentsOther0.57
275276guided drawingOther0.56
\n", "
" ], "text/plain": [ " Index Keyword Category Probablity Score\n", "227 228 guide for students Other 0.57\n", "275 276 guided drawing Other 0.56" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.Category=='Other']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Computers_and_Electronics', 1.0)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_top_labels(\n", " 'turtle beach headset guide'\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Predicted Class: Computers_and_Electronics \n", "probabilities_scores: 0.9980000257492065\n", "\n" ] }, { "data": { "text/plain": [ "{'P(Hobbies_and_Leisure)': 0.221,\n", " 'P(News)': 0.001,\n", " 'P(Science)': 0.032,\n", " 'P(Autos_and_Vehicles)': 0.013,\n", " 'P(Health)': 0.004,\n", " 'P(Pets_and_Animals)': 0.162,\n", " 'P(Adult)': 0.013,\n", " 'P(Computers_and_Electronics)': 0.998,\n", " 'P(Online Communities)': 0.255,\n", " 'P(Beauty_and_Fitness)': 0.016,\n", " 'P(People_and_Society)': 0.0,\n", " 'P(Business_and_Industrial)': 0.001,\n", " 'P(Reference)': 0.003,\n", " 'P(Shopping)': 0.083,\n", " 'P(Travel_and_Transportation)': 0.006,\n", " 'P(Food_and_Drink)': 0.01,\n", " 'P(Law_and_Government)': 0.005,\n", " 'P(Books_and_Literature)': 0.001,\n", " 'P(Finance)': 0.006,\n", " 'P(Games)': 0.045,\n", " 'P(Home_and_Garden)': 0.021,\n", " 'P(Jobs_and_Education)': 0.001,\n", " 'P(Arts_and_Entertainment)': 0.004,\n", " 'P(Sensitive Subjects)': 0.002,\n", " 'P(Real Estate)': 0.009,\n", " 'P(Internet_and_Telecom)': 0.029,\n", " 'P(Sports)': 0.014,\n", " 'Predicted Label': 'Computers_and_Electronics',\n", " 'Predicted Label Score': 0.998}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict(\n", " 'turtle beach headphones guide'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os; os.chdir('..')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] } ], "source": [ "from utils.get_category import get_top_labels, predict" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Food_and_Drink', 0.989),\n", " ('Computers_and_Electronics', 0.973),\n", " ('Games', 0.172),\n", " ('Shopping', 0.134)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_top_labels(\n", " \"apple\"\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Computers_and_Electronics', 0.999), ('Shopping', 0.993)]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_top_labels(\n", " 'amazon mindkoo headsets with discount'\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Home_and_Garden', 0.999), ('Computers_and_Electronics', 0.243)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_top_labels(\n", " 'how to use lawn mower'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }