{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'NAME', 'host id', 'host name', 'neighbourhood group',\n", " 'neighbourhood', 'lat', 'long', 'country', 'country code',\n", " 'instant_bookable', 'cancellation_policy', 'room type',\n", " 'Construction year', 'price', 'service fee', 'minimum nights',\n", " 'number of reviews', 'last review', 'reviews per month',\n", " 'review rate number', 'calculated host listings count',\n", " 'availability 365', 'house_rules', 'license'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\ardit\\AppData\\Local\\Temp\\ipykernel_25752\\2207992772.py:4: DtypeWarning: Columns (25) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv('Airbnb_Open_Data.csv')\n" ] } ], "source": [
"import pandas as pd\n",
"import random\n",
"\n",
"# Fixed seed so the synthetic 'sq. meters' column is identical on every run.\n",
"random.seed(42)\n",
"\n",
"# low_memory=False parses each column in one pass, which avoids the mixed-type DtypeWarning.\n",
"df = pd.read_csv('Airbnb_Open_Data.csv', low_memory=False)\n",
"df = df.drop('host_identity_verified', axis=1)\n",
"df['description'] = df['NAME']\n",
"# Drop the first character (presumably a currency sign - TODO confirm), surrounding whitespace\n",
"# and ',' separators, then cast to int. Rows without a price stay NaN and are dropped below.\n",
"df['price'] = df['price'].dropna().apply(lambda x : int(x[1:].strip().replace(',', '')))\n",
"# Draw one weighted random size per row in a single call; smaller sizes carry higher weights.\n",
"df['sq. meters'] = random.choices([25, 40, 45, 55, 60, 70], weights=[5, 5, 4, 3, 2, 1], k=len(df))\n",
"df = df[['price', 'sq. meters', 'description', 'neighbourhood group', 'host name', 'cancellation_policy', 'house_rules']]\n",
"# Discard rows whose house rules are the literal '#NAME?' and rows with any missing value.\n",
"df = df[df['house_rules']!='#NAME?'].dropna().reset_index(drop=True)\n",
"# Keep the first 10000 rows only.\n",
"df = df[0:10000]" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10000/10000 [17:37<00:00, 9.45it/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricesq. metersdescriptionneighbourhood grouphost namecancellation_policyhouse_rulestext_vector_
0966.025Clean & quiet apt home by the parkBrooklynMadalinestrictClean up and treat the home the way you'd like...[-0.047521110624074936, 0.03044620156288147, 0...
1142.025Skylit Midtown CastleManhattanJennamoderatePet friendly but please confirm with me if the...[-0.04690079391002655, 0.061329323798418045, 0...
2620.045THE VILLAGE OF HARLEM....NEW YORK !ManhattanEliseflexibleI encourage you to use my kitchen, cooking and...[0.00039011164335533977, 0.018310122191905975,...
3204.055Entire Apt: Spacious Studio/Loft by central parkManhattanLyndonmoderatePlease no smoking in the house, porch or on th...[-0.04602213576436043, 0.015605293214321136, 0...
4577.025Large Cozy 1 BR Apartment In Midtown EastManhattanMichelleflexibleNo smoking, please, and no drugs.[-0.04859349876642227, -0.01263828668743372, 0...
...........................
9995745.060Upper West Side 1BR next to subway/Central ParkManhattanDoreenstrictOur Herbivorian House manual with detailed rul...[-0.0346745029091835, -0.005859952419996262, 0...
99961135.045Modern and Bright Studio Apt in WilliamsburgBrooklynShannonstrictNo smoking please![-0.016586357727646828, 0.020517650991678238, ...
999759.045Holiday in Trendy Williamsburg Apt!BrooklynPeterstrictWe suggest you use email or texting contact us...[-0.05095353722572327, 0.08510775864124298, -0...
99981055.025Greenwich Village| Private Queen roomManhattanKellyflexiblePlease treat this house as if it is your own. ...[0.00017118529649451375, 0.010939894244074821,...
9999285.025Comfortable bedroom in spacious aptBrooklynArthurstrictPlease, No smoking and no pets. We do require ...[-0.01795135624706745, -0.029596544802188873, ...
\n", "

10000 rows × 8 columns

\n", "
" ], "text/plain": [ " price sq. meters description \\\n", "0 966.0 25 Clean & quiet apt home by the park \n", "1 142.0 25 Skylit Midtown Castle \n", "2 620.0 45 THE VILLAGE OF HARLEM....NEW YORK ! \n", "3 204.0 55 Entire Apt: Spacious Studio/Loft by central park \n", "4 577.0 25 Large Cozy 1 BR Apartment In Midtown East \n", "... ... ... ... \n", "9995 745.0 60 Upper West Side 1BR next to subway/Central Park \n", "9996 1135.0 45 Modern and Bright Studio Apt in Williamsburg \n", "9997 59.0 45 Holiday in Trendy Williamsburg Apt! \n", "9998 1055.0 25 Greenwich Village| Private Queen room \n", "9999 285.0 25 Comfortable bedroom in spacious apt \n", "\n", " neighbourhood group host name cancellation_policy \\\n", "0 Brooklyn Madaline strict \n", "1 Manhattan Jenna moderate \n", "2 Manhattan Elise flexible \n", "3 Manhattan Lyndon moderate \n", "4 Manhattan Michelle flexible \n", "... ... ... ... \n", "9995 Manhattan Doreen strict \n", "9996 Brooklyn Shannon strict \n", "9997 Brooklyn Peter strict \n", "9998 Manhattan Kelly flexible \n", "9999 Brooklyn Arthur strict \n", "\n", " house_rules \\\n", "0 Clean up and treat the home the way you'd like... \n", "1 Pet friendly but please confirm with me if the... \n", "2 I encourage you to use my kitchen, cooking and... \n", "3 Please no smoking in the house, porch or on th... \n", "4 No smoking, please, and no drugs. \n", "... ... \n", "9995 Our Herbivorian House manual with detailed rul... \n", "9996 No smoking please! \n", "9997 We suggest you use email or texting contact us... \n", "9998 Please treat this house as if it is your own. ... \n", "9999 Please, No smoking and no pets. We do require ... \n", "\n", " text_vector_ \n", "0 [-0.047521110624074936, 0.03044620156288147, 0... \n", "1 [-0.04690079391002655, 0.061329323798418045, 0... \n", "2 [0.00039011164335533977, 0.018310122191905975,... \n", "3 [-0.04602213576436043, 0.015605293214321136, 0... \n", "4 [-0.04859349876642227, -0.01263828668743372, 0... \n", "... ... 
\n", "9995 [-0.0346745029091835, -0.005859952419996262, 0... \n", "9996 [-0.016586357727646828, 0.020517650991678238, ... \n", "9997 [-0.05095353722572327, 0.08510775864124298, -0... \n", "9998 [0.00017118529649451375, 0.010939894244074821,... \n", "9999 [-0.01795135624706745, -0.029596544802188873, ... \n", "\n", "[10000 rows x 8 columns]" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from sentence_transformers import SentenceTransformer\n",
"tqdm.pandas()\n",
"\n",
"model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
"\n",
"# Encode every description in one batched call instead of one model.encode call per row\n",
"# (the row-by-row version needed about 17 minutes for 10000 rows).\n",
"embeddings = model.encode(df['description'].tolist(), show_progress_bar=True)\n",
"# Store plain Python lists, one vector per row, as the row-by-row version did.\n",
"df['text_vector_'] = embeddings.tolist()\n",
"\n",
"# Persist the encoded frame: the following cells reload it from 'df_encoded.parquet'.\n",
"df.to_parquet('df_encoded.parquet')\n",
"df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"df = pd.read_parquet('df_encoded.parquet')\n",
"# Relabel the boroughs in four positional blocks with one .iloc assignment each.\n",
"# The chained form df[col][a:b] = ... is flagged by pandas (SettingWithCopyWarning /\n",
"# ChainedAssignmentError) and does not modify df when Copy-on-Write is enabled.\n",
"group_col = df.columns.get_loc('neighbourhood group')\n",
"df.iloc[0:2500, group_col] = 'Manhattan'\n",
"df.iloc[2500:5000, group_col] = 'Brooklyn'\n",
"df.iloc[5000:7500, group_col] = 'Queens'\n",
"df.iloc[7500:, group_col] = 'Bronx'\n",
"df['location'] = df['neighbourhood group']\n",
"df = df[['price', 'sq. 
meters', 'description', 'location', 'host name', 'cancellation_policy', 'house_rules', 'text_vector_']]\n", "df = df.reset_index(drop=True)\n", "df" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import NearestNeighbors\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from sentence_transformers import SentenceTransformer\n", "\n", "# df = df.read_parquet('df_encoded.parquet')\n", "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n", "\n", "#prepare model\n", "# nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import gradio as gr\n",
"import statistics\n",
"\n",
"def closest_number(x):\n",
"    \"\"\"Return the selectable room size (sq. meters) nearest to ``x``.\n",
"\n",
"    On a tie the smaller size wins, because ``min`` keeps the first minimum it meets.\n",
"    \"\"\"\n",
"    # Must mirror the choices offered by the 'square meters' radio button below.\n",
"    room_sizes = [25, 40, 45, 55, 60, 70]\n",
"    return min(room_sizes, key=lambda size: abs(x - size))\n",
"\n",
"def search(df, query):\n",
"    \"\"\"Return up to three rows of ``df`` whose 'text_vector_' is nearest to the encoded ``query``.\n",
"\n",
"    The 'text_vector_' column is dropped from the result, which is sorted by 'price' in\n",
"    descending order. An empty ``df`` yields an empty result instead of an error.\n",
"    \"\"\"\n",
"    if df.empty:\n",
"        # The filters matched nothing; NearestNeighbors cannot be fitted on zero rows.\n",
"        return df.drop(['text_vector_'], axis=1)\n",
"\n",
"    product = model.encode(query).tolist()\n",
"    # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
"\n",
"    # Fitted on every call because df is a different filtered subset each time.\n",
"    # kneighbors rejects n_neighbors larger than the number of fitted rows, so cap it.\n",
"    nbrs = NearestNeighbors(n_neighbors=min(3, len(df)), algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
"    _, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
"\n",
"    # indices holds one row of positions per query vector; a single query was passed.\n",
"    df_search = df.iloc[indices[0]].drop(['text_vector_'], axis=1)\n",
"\n",
"    return df_search.sort_values('price', ascending=False)\n",
"\n",
"def filter_df(df, column_name, filter_type, filter_value):\n",
"    \"\"\"Return the rows of ``df`` whose ``column_name`` compares to ``filter_value``.\n",
"\n",
"    ``filter_type`` is one of '==', '>=' or '<='. Any other value raises ValueError.\n",
"    \"\"\"\n",
"    if filter_type == '==':\n",
"        return df[df[column_name]==filter_value]\n",
"    if filter_type == '>=':\n",
"        return df[df[column_name]>=filter_value]\n",
"    if filter_type == '<=':\n",
"        return df[df[column_name]<=filter_value]\n",
"    raise ValueError(f'unsupported filter_type: {filter_type!r}')\n",
"\n",
"history = list()\n",
"def predict(input1, input2, input3, input4):\n",
"    \"\"\"Run one search and suggest settings for the next one.\n",
"\n",
"    input1 is the maximum price, input2 the room size in sq. meters, input3 the location and\n",
"    input4 the free-text query. Every call is appended to the module-level ``history`` list,\n",
"    so the suggestion covers all searches made since this cell was run.\n",
"\n",
"    Returns the matching listings and a list holding the rounded mean price, the selectable\n",
"    size nearest to the mean size, and the most frequent location.\n",
"    \"\"\"\n",
"    history.append([input1, input2, input3, input4])\n",
"\n",
"    print(history)\n",
"    df_location = filter_df(df, 'location', '==', input3)\n",
"    df_size = filter_df(df_location, 'sq. meters', '==', input2)\n",
"    df_price = filter_df(df_size, 'price', '<=', input1)\n",
"    df_result = search(df_price, input4)\n",
"\n",
"    prediction = [\n",
"        round(statistics.mean([x[0] for x in history])), #price\n",
"        closest_number(statistics.mean([x[1] for x in history])), #square meters\n",
"        statistics.mode([x[2] for x in history]) #state\n",
"    ]\n",
"\n",
"    return df_result, prediction\n",
"\n",
"with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
"    gr.Markdown(\n",
"        \"\"\"\n",
"        # Airbnb Search Engine\n",
"        \"\"\"\n",
"    )\n",
"    # Slider takes step (step_size is ignored with a warning); Radio has no multiselect option.\n",
"    input1 = gr.Slider(100, 1200, value=700, step=100, label=\"Max Price\")\n",
"    input2 = gr.Radio([25, 40, 45, 55, 60, 70], label='square meters', value=45)\n",
"    input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], label='State', value='Queens')\n",
"    input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')\n",
"\n",
"    btn = gr.Button(value=\"Search for a Room\")\n",
"    output1 = gr.Dataframe()\n",
"    output2 = gr.Textbox(label='prediction for the next search')\n",
"    btn.click(predict, [input1, input2, input3, input4], [output1, output2])\n",
"demo.launch(share=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.system('pip install openpyxl')\n", "os.system('pip install sentence-transformers')\n", "import pandas as pd\n", "import gradio as 
gr\n",
"import statistics\n",
"from sklearn.neighbors import NearestNeighbors\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"df = pd.read_parquet('df_encoded.parquet')\n",
"# Relabel the boroughs in four positional blocks with one .iloc assignment each.\n",
"# The chained form df[col][a:b] = ... is flagged by pandas (SettingWithCopyWarning /\n",
"# ChainedAssignmentError) and does not modify df when Copy-on-Write is enabled.\n",
"group_col = df.columns.get_loc('neighbourhood group')\n",
"df.iloc[0:2500, group_col] = 'Manhattan'\n",
"df.iloc[2500:5000, group_col] = 'Brooklyn'\n",
"df.iloc[5000:7500, group_col] = 'Queens'\n",
"df.iloc[7500:, group_col] = 'Bronx'\n",
"df['location'] = df['neighbourhood group']\n",
"df = df[['price', 'sq. meters', 'description', 'location', 'host name', 'cancellation_policy', 'house_rules', 'text_vector_']]\n",
"df = df.reset_index(drop=True)\n",
"\n",
"model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
"\n",
"#prepare model #we run it anew in the search function every time, after the initial filtering\n",
"# nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
"\n",
"def closest_number(x):\n",
"    \"\"\"Return the selectable room size (sq. meters) nearest to ``x``.\n",
"\n",
"    On a tie the smaller size wins, because ``min`` keeps the first minimum it meets.\n",
"    \"\"\"\n",
"    room_sizes = [25, 40, 45, 55, 60, 70]\n",
"    return min(room_sizes, key=lambda size: abs(x - size))\n",
"\n",
"def search(df, query):\n",
"    \"\"\"Return up to three rows of ``df`` whose 'text_vector_' is nearest to the encoded ``query``.\n",
"\n",
"    The 'text_vector_' column is dropped from the result, which is sorted by 'price' in\n",
"    descending order. An empty ``df`` yields an empty result instead of an error.\n",
"    \"\"\"\n",
"    if df.empty:\n",
"        # The filters matched nothing; NearestNeighbors cannot be fitted on zero rows.\n",
"        return df.drop(['text_vector_'], axis=1)\n",
"\n",
"    product = model.encode(query).tolist()\n",
"    # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
"\n",
"    # Fitted on every call because df is a different filtered subset each time.\n",
"    # kneighbors rejects n_neighbors larger than the number of fitted rows, so cap it.\n",
"    nbrs = NearestNeighbors(n_neighbors=min(3, len(df)), algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
"    _, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
"\n",
"    # indices holds one row of positions per query vector; a single query was passed.\n",
"    df_search = df.iloc[indices[0]].drop(['text_vector_'], axis=1)\n",
"\n",
"    return df_search.sort_values('price', ascending=False)\n",
"\n",
"def filter_df(df, column_name, filter_type, filter_value):\n",
"    \"\"\"Return the rows of ``df`` whose ``column_name`` compares to ``filter_value``.\n",
"\n",
"    ``filter_type`` is one of '==', '>=' or '<='. Any other value raises ValueError.\n",
"    \"\"\"\n",
"    if filter_type == '==':\n",
"        return df[df[column_name]==filter_value]\n",
"    if filter_type == '>=':\n",
"        return df[df[column_name]>=filter_value]\n",
"    if filter_type == '<=':\n",
"        return df[df[column_name]<=filter_value]\n",
"    raise ValueError(f'unsupported filter_type: {filter_type!r}')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
"def predict(history, input1, input2, input3, input4):\n",
"    \"\"\"Run one search and suggest settings for the next one.\n",
"\n",
"    ``history`` is the list of earlier searches and is extended in place with this one.\n",
"    input1 is the maximum price, input2 the room size in sq. meters, input3 the location and\n",
"    input4 the free-text query.\n",
"\n",
"    Returns the matching listings and a list holding the rounded mean price, the selectable\n",
"    size nearest to the mean size, and the most frequent location over ``history``.\n",
"    \"\"\"\n",
"    history.append([input1, input2, input3, input4])\n",
"\n",
"    print(history)\n",
"    df_location = filter_df(df, 'location', '==', input3)\n",
"    df_size = filter_df(df_location, 'sq. meters', '==', input2)\n",
"    df_price = filter_df(df_size, 'price', '<=', input1)\n",
"    df_result = search(df_price, input4)\n",
"\n",
"    prediction = [\n",
"        round(statistics.mean([x[0] for x in history])), #price\n",
"        closest_number(statistics.mean([x[1] for x in history])), #square meters\n",
"        statistics.mode([x[2] for x in history]) #state\n",
"    ]\n",
"\n",
"    return df_result, prediction" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Slider, please remove them: {'step_size': 100}\n", " warnings.warn(\n", "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7863\n", "\n", 
"To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n", "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n", "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!'], [700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n", "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!'], [700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n" ] } ], "source": [
"with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
"    # Search history, starting empty; gr.State replaces the deprecated gr.Variable alias.\n",
"    history = gr.State(value=[])\n",
"    gr.Markdown(\n",
"        \"\"\"\n",
"        # Airbnb Search Engine\n",
"        \"\"\"\n",
"    )\n",
"    # Slider takes step (step_size is ignored with a warning); Radio has no multiselect option.\n",
"    input1 = gr.Slider(100, 1200, value=700, step=100, label=\"Max Price\")\n",
"    input2 = gr.Radio([25, 40, 45, 55, 60, 70], label='square meters', value=45)\n",
"    input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], label='State', value='Brooklyn')\n",
"    input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')\n",
"\n",
"    btn = gr.Button(value=\"Search for a Room\")\n",
"    output1 = gr.Dataframe()\n",
"    output2 = gr.Textbox(label='prediction for the next search')\n",
"    btn.click(predict, [history, input1, input2, input3, input4], [output1, output2])\n",
"demo.launch(share=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "orig_nbformat": 4 }, 
"nbformat": 4, "nbformat_minor": 2 }