{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os; os.chdir('..')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"import pandas as pd\n",
"from utils.get_category import predict, get_top_labels"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Index | \n",
" Keyword | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" guide to headphones | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" headphone guide | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" buy headphones guide | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" choosing headphones guide | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" sony headphones guide | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 272 | \n",
" 273 | \n",
" guidelines 2022 | \n",
"
\n",
" \n",
" 273 | \n",
" 274 | \n",
" guidelines 2023 | \n",
"
\n",
" \n",
" 274 | \n",
" 275 | \n",
" ts guidelines | \n",
"
\n",
" \n",
" 275 | \n",
" 276 | \n",
" guided drawing | \n",
"
\n",
" \n",
" 276 | \n",
" 277 | \n",
" guided meditation | \n",
"
\n",
" \n",
"
\n",
"
277 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Index Keyword\n",
"0 1 guide to headphones\n",
"1 2 headphone guide\n",
"2 3 buy headphones guide\n",
"3 4 choosing headphones guide\n",
"4 5 sony headphones guide\n",
".. ... ...\n",
"272 273 guidelines 2022\n",
"273 274 guidelines 2023\n",
"274 275 ts guidelines\n",
"275 276 guided drawing\n",
"276 277 guided meditation\n",
"\n",
"[277 rows x 2 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df= pd.read_csv(\n",
" 'data_test/keywords-2.csv'\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted Class: Computers_and_Electronics \n",
"probabilities_scores: 1.0\n",
"\n"
]
},
{
"data": {
"text/plain": [
"{'P(Hobbies_and_Leisure)': 0.034,\n",
" 'P(News)': 0.005,\n",
" 'P(Science)': 0.008,\n",
" 'P(Autos_and_Vehicles)': 0.002,\n",
" 'P(Health)': 0.009,\n",
" 'P(Pets_and_Animals)': 0.003,\n",
" 'P(Adult)': 0.017,\n",
" 'P(Computers_and_Electronics)': 1.0,\n",
" 'P(Online Communities)': 0.019,\n",
" 'P(Beauty_and_Fitness)': 0.007,\n",
" 'P(People_and_Society)': 0.0,\n",
" 'P(Business_and_Industrial)': 0.001,\n",
" 'P(Reference)': 0.007,\n",
" 'P(Shopping)': 0.173,\n",
" 'P(Travel_and_Transportation)': 0.001,\n",
" 'P(Food_and_Drink)': 0.012,\n",
" 'P(Law_and_Government)': 0.022,\n",
" 'P(Books_and_Literature)': 0.001,\n",
" 'P(Finance)': 0.01,\n",
" 'P(Games)': 0.069,\n",
" 'P(Home_and_Garden)': 0.011,\n",
" 'P(Jobs_and_Education)': 0.001,\n",
" 'P(Arts_and_Entertainment)': 0.005,\n",
" 'P(Sensitive Subjects)': 0.003,\n",
" 'P(Real Estate)': 0.009,\n",
" 'P(Internet_and_Telecom)': 0.063,\n",
" 'P(Sports)': 0.014,\n",
" 'Predicted Label': 'Computers_and_Electronics',\n",
" 'Predicted Label Score': 1.0}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict('cat ear headphones')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df['Category']= df.Keyword.map(get_top_labels)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# import numpy as np\n",
"# for i in range(len(df.Category)):\n",
"# # print(type(df.Category[i]))\n",
"# # df.Category[i]= df.Category[i]['Predicted Label']\n",
"# # df.loc[i, 'Category']= f\"{df.Category[i]['Predicted Label']}, {str(np.round(df.Category[i]['Predicted Label Score'],2))}\"\n",
"# df.loc[i, 'Probablity Score']= f\"{str(np.round(df.Category[i]['Predicted Label Score'],2))}\"\n",
"# df.loc[i, 'Category']= f\"{df.Category[i]['Predicted Label']}\"\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Index | \n",
" Keyword | \n",
" Category | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" guide to headphones | \n",
" [(Computers_and_Electronics, 1.0)] | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" headphone guide | \n",
" [(Computers_and_Electronics, 0.999)] | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" buy headphones guide | \n",
" [(Shopping, 0.997), (Computers_and_Electronics... | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" choosing headphones guide | \n",
" [(Computers_and_Electronics, 1.0)] | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" sony headphones guide | \n",
" [(Computers_and_Electronics, 1.0)] | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 272 | \n",
" 273 | \n",
" guidelines 2022 | \n",
" [(Computers_and_Electronics, 1.0)] | \n",
"
\n",
" \n",
" 273 | \n",
" 274 | \n",
" guidelines 2023 | \n",
" [(Computers_and_Electronics, 1.0)] | \n",
"
\n",
" \n",
" 274 | \n",
" 275 | \n",
" ts guidelines | \n",
" [(Computers_and_Electronics, 0.933), (Games, 0... | \n",
"
\n",
" \n",
" 275 | \n",
" 276 | \n",
" guided drawing | \n",
" [(Reference, 0.995)] | \n",
"
\n",
" \n",
" 276 | \n",
" 277 | \n",
" guided meditation | \n",
" [(Beauty_and_Fitness, 1.0), (Hobbies_and_Leisu... | \n",
"
\n",
" \n",
"
\n",
"
277 rows × 3 columns
\n",
"
"
],
"text/plain": [
" Index Keyword \\\n",
"0 1 guide to headphones \n",
"1 2 headphone guide \n",
"2 3 buy headphones guide \n",
"3 4 choosing headphones guide \n",
"4 5 sony headphones guide \n",
".. ... ... \n",
"272 273 guidelines 2022 \n",
"273 274 guidelines 2023 \n",
"274 275 ts guidelines \n",
"275 276 guided drawing \n",
"276 277 guided meditation \n",
"\n",
" Category \n",
"0 [(Computers_and_Electronics, 1.0)] \n",
"1 [(Computers_and_Electronics, 0.999)] \n",
"2 [(Shopping, 0.997), (Computers_and_Electronics... \n",
"3 [(Computers_and_Electronics, 1.0)] \n",
"4 [(Computers_and_Electronics, 1.0)] \n",
".. ... \n",
"272 [(Computers_and_Electronics, 1.0)] \n",
"273 [(Computers_and_Electronics, 1.0)] \n",
"274 [(Computers_and_Electronics, 0.933), (Games, 0... \n",
"275 [(Reference, 0.995)] \n",
"276 [(Beauty_and_Fitness, 1.0), (Hobbies_and_Leisu... \n",
"\n",
"[277 rows x 3 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\n",
" 'data_test/labelled_data.csv',\n",
" index=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Index | \n",
" Keyword | \n",
" Category | \n",
" Probablity Score | \n",
"
\n",
" \n",
" \n",
" \n",
" 93 | \n",
" 94 | \n",
" reddit headphones buying guide | \n",
" Internet_and_Telecom | \n",
" 0.98 | \n",
"
\n",
" \n",
" 116 | \n",
" 117 | \n",
" turtle beach headset guide | \n",
" Pets_and_Animals | \n",
" 0.99 | \n",
"
\n",
" \n",
" 120 | \n",
" 121 | \n",
" earbuds buying guide reddit | \n",
" Online Communities | \n",
" 0.91 | \n",
"
\n",
" \n",
" 208 | \n",
" 209 | \n",
" guides | \n",
" Sports | \n",
" 0.7 | \n",
"
\n",
" \n",
" 224 | \n",
" 225 | \n",
" guide book | \n",
" Books_and_Literature | \n",
" 0.86 | \n",
"
\n",
" \n",
" 225 | \n",
" 226 | \n",
" guide for sale | \n",
" Shopping | \n",
" 1.0 | \n",
"
\n",
" \n",
" 226 | \n",
" 227 | \n",
" guide for school | \n",
" Jobs_and_Education | \n",
" 0.84 | \n",
"
\n",
" \n",
" 227 | \n",
" 228 | \n",
" guide for students | \n",
" Other | \n",
" 0.57 | \n",
"
\n",
" \n",
" 248 | \n",
" 249 | \n",
" guidebook | \n",
" Books_and_Literature | \n",
" 0.91 | \n",
"
\n",
" \n",
" 251 | \n",
" 252 | \n",
" guide reddit | \n",
" Online Communities | \n",
" 1.0 | \n",
"
\n",
" \n",
" 260 | \n",
" 261 | \n",
" hr guide interview questions | \n",
" Jobs_and_Education | \n",
" 1.0 | \n",
"
\n",
" \n",
" 262 | \n",
" 263 | \n",
" vi beginners guide | \n",
" Hobbies_and_Leisure | \n",
" 0.9 | \n",
"
\n",
" \n",
" 265 | \n",
" 266 | \n",
" guidelines | \n",
" Sensitive Subjects | \n",
" 1.0 | \n",
"
\n",
" \n",
" 270 | \n",
" 271 | \n",
" guided reading | \n",
" Books_and_Literature | \n",
" 1.0 | \n",
"
\n",
" \n",
" 271 | \n",
" 272 | \n",
" guided reading level | \n",
" Books_and_Literature | \n",
" 0.98 | \n",
"
\n",
" \n",
" 275 | \n",
" 276 | \n",
" guided drawing | \n",
" Other | \n",
" 0.56 | \n",
"
\n",
" \n",
" 276 | \n",
" 277 | \n",
" guided meditation | \n",
" Beauty_and_Fitness | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Index Keyword Category \\\n",
"93 94 reddit headphones buying guide Internet_and_Telecom \n",
"116 117 turtle beach headset guide Pets_and_Animals \n",
"120 121 earbuds buying guide reddit Online Communities \n",
"208 209 guides Sports \n",
"224 225 guide book Books_and_Literature \n",
"225 226 guide for sale Shopping \n",
"226 227 guide for school Jobs_and_Education \n",
"227 228 guide for students Other \n",
"248 249 guidebook Books_and_Literature \n",
"251 252 guide reddit Online Communities \n",
"260 261 hr guide interview questions Jobs_and_Education \n",
"262 263 vi beginners guide Hobbies_and_Leisure \n",
"265 266 guidelines Sensitive Subjects \n",
"270 271 guided reading Books_and_Literature \n",
"271 272 guided reading level Books_and_Literature \n",
"275 276 guided drawing Other \n",
"276 277 guided meditation Beauty_and_Fitness \n",
"\n",
" Probablity Score \n",
"93 0.98 \n",
"116 0.99 \n",
"120 0.91 \n",
"208 0.7 \n",
"224 0.86 \n",
"225 1.0 \n",
"226 0.84 \n",
"227 0.57 \n",
"248 0.91 \n",
"251 1.0 \n",
"260 1.0 \n",
"262 0.9 \n",
"265 1.0 \n",
"270 1.0 \n",
"271 0.98 \n",
"275 0.56 \n",
"276 1.0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# df[df.Category!='Computers_and_Electronics']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Index | \n",
" Keyword | \n",
" Category | \n",
" Probablity Score | \n",
"
\n",
" \n",
" \n",
" \n",
" 227 | \n",
" 228 | \n",
" guide for students | \n",
" Other | \n",
" 0.57 | \n",
"
\n",
" \n",
" 275 | \n",
" 276 | \n",
" guided drawing | \n",
" Other | \n",
" 0.56 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Index Keyword Category Probablity Score\n",
"227 228 guide for students Other 0.57\n",
"275 276 guided drawing Other 0.56"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.Category=='Other']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Computers_and_Electronics', 1.0)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_top_labels(\n",
" 'turtle beach headset guide'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted Class: Computers_and_Electronics \n",
"probabilities_scores: 0.9980000257492065\n",
"\n"
]
},
{
"data": {
"text/plain": [
"{'P(Hobbies_and_Leisure)': 0.221,\n",
" 'P(News)': 0.001,\n",
" 'P(Science)': 0.032,\n",
" 'P(Autos_and_Vehicles)': 0.013,\n",
" 'P(Health)': 0.004,\n",
" 'P(Pets_and_Animals)': 0.162,\n",
" 'P(Adult)': 0.013,\n",
" 'P(Computers_and_Electronics)': 0.998,\n",
" 'P(Online Communities)': 0.255,\n",
" 'P(Beauty_and_Fitness)': 0.016,\n",
" 'P(People_and_Society)': 0.0,\n",
" 'P(Business_and_Industrial)': 0.001,\n",
" 'P(Reference)': 0.003,\n",
" 'P(Shopping)': 0.083,\n",
" 'P(Travel_and_Transportation)': 0.006,\n",
" 'P(Food_and_Drink)': 0.01,\n",
" 'P(Law_and_Government)': 0.005,\n",
" 'P(Books_and_Literature)': 0.001,\n",
" 'P(Finance)': 0.006,\n",
" 'P(Games)': 0.045,\n",
" 'P(Home_and_Garden)': 0.021,\n",
" 'P(Jobs_and_Education)': 0.001,\n",
" 'P(Arts_and_Entertainment)': 0.004,\n",
" 'P(Sensitive Subjects)': 0.002,\n",
" 'P(Real Estate)': 0.009,\n",
" 'P(Internet_and_Telecom)': 0.029,\n",
" 'P(Sports)': 0.014,\n",
" 'Predicted Label': 'Computers_and_Electronics',\n",
" 'Predicted Label Score': 0.998}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict(\n",
" 'turtle beach headphones guide'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os; os.chdir('..')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"from utils.get_category import get_top_labels, predict"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Food_and_Drink', 0.989),\n",
" ('Computers_and_Electronics', 0.973),\n",
" ('Games', 0.172),\n",
" ('Shopping', 0.134)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_top_labels(\n",
" \"apple\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Computers_and_Electronics', 0.999), ('Shopping', 0.993)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_top_labels(\n",
" 'amazon mindkoo headsets with discount'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Home_and_Garden', 0.999), ('Computers_and_Electronics', 0.243)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_top_labels(\n",
" 'how to use lawn mower'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}