Michelangiolo committed on
Commit
00bf920
1 Parent(s): f7ba55e

first push

Browse files
Files changed (4) hide show
  1. Airbnb_Open_Data.csv +0 -0
  2. airbnb.ipynb +486 -0
  3. app.py +91 -0
  4. df_encoded.parquet +3 -0
Airbnb_Open_Data.csv ADDED
The diff for this file is too large to render. See raw diff
 
airbnb.ipynb ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "Index(['id', 'NAME', 'host id', 'host name', 'neighbourhood group',\n",
12
+ " 'neighbourhood', 'lat', 'long', 'country', 'country code',\n",
13
+ " 'instant_bookable', 'cancellation_policy', 'room type',\n",
14
+ " 'Construction year', 'price', 'service fee', 'minimum nights',\n",
15
+ " 'number of reviews', 'last review', 'reviews per month',\n",
16
+ " 'review rate number', 'calculated host listings count',\n",
17
+ " 'availability 365', 'house_rules', 'license'],\n",
18
+ " dtype='object')"
19
+ ]
20
+ },
21
+ "execution_count": 3,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "df.columns"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 71,
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stderr",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "C:\\Users\\ardit\\AppData\\Local\\Temp\\ipykernel_25752\\2207992772.py:4: DtypeWarning: Columns (25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
40
+ " df = pd.read_csv('Airbnb_Open_Data.csv')\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "import pandas as pd\n",
46
+ "import random\n",
47
+ "\n",
48
+ "df = pd.read_csv('Airbnb_Open_Data.csv')\n",
49
+ "df = df.drop('host_identity_verified', axis=1)\n",
50
+ "df['description'] = df['NAME']\n",
51
+ "df['price'] = df['price'].dropna().apply(lambda x : int(x[1:].strip().replace(',', '')))\n",
52
+ "df['sq. meters'] = df['price'].apply(lambda x : random.choices([25, 40, 45, 55, 60, 70], weights=[5, 5, 4, 3, 2, 1])[0])\n",
53
+ "df = df[['price', 'sq. meters', 'description', 'neighbourhood group', 'host name', 'cancellation_policy', 'house_rules']]\n",
54
+ "df = df[df['house_rules']!='#NAME?'].dropna().reset_index(drop=True)\n",
55
+ "df = df[0:10000]"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 72,
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "100%|██████████| 10000/10000 [17:37<00:00, 9.45it/s]\n"
68
+ ]
69
+ },
70
+ {
71
+ "data": {
72
+ "text/html": [
73
+ "<div>\n",
74
+ "<style scoped>\n",
75
+ " .dataframe tbody tr th:only-of-type {\n",
76
+ " vertical-align: middle;\n",
77
+ " }\n",
78
+ "\n",
79
+ " .dataframe tbody tr th {\n",
80
+ " vertical-align: top;\n",
81
+ " }\n",
82
+ "\n",
83
+ " .dataframe thead th {\n",
84
+ " text-align: right;\n",
85
+ " }\n",
86
+ "</style>\n",
87
+ "<table border=\"1\" class=\"dataframe\">\n",
88
+ " <thead>\n",
89
+ " <tr style=\"text-align: right;\">\n",
90
+ " <th></th>\n",
91
+ " <th>price</th>\n",
92
+ " <th>sq. meters</th>\n",
93
+ " <th>description</th>\n",
94
+ " <th>neighbourhood group</th>\n",
95
+ " <th>host name</th>\n",
96
+ " <th>cancellation_policy</th>\n",
97
+ " <th>house_rules</th>\n",
98
+ " <th>text_vector_</th>\n",
99
+ " </tr>\n",
100
+ " </thead>\n",
101
+ " <tbody>\n",
102
+ " <tr>\n",
103
+ " <th>0</th>\n",
104
+ " <td>966.0</td>\n",
105
+ " <td>25</td>\n",
106
+ " <td>Clean &amp; quiet apt home by the park</td>\n",
107
+ " <td>Brooklyn</td>\n",
108
+ " <td>Madaline</td>\n",
109
+ " <td>strict</td>\n",
110
+ " <td>Clean up and treat the home the way you'd like...</td>\n",
111
+ " <td>[-0.047521110624074936, 0.03044620156288147, 0...</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>1</th>\n",
115
+ " <td>142.0</td>\n",
116
+ " <td>25</td>\n",
117
+ " <td>Skylit Midtown Castle</td>\n",
118
+ " <td>Manhattan</td>\n",
119
+ " <td>Jenna</td>\n",
120
+ " <td>moderate</td>\n",
121
+ " <td>Pet friendly but please confirm with me if the...</td>\n",
122
+ " <td>[-0.04690079391002655, 0.061329323798418045, 0...</td>\n",
123
+ " </tr>\n",
124
+ " <tr>\n",
125
+ " <th>2</th>\n",
126
+ " <td>620.0</td>\n",
127
+ " <td>45</td>\n",
128
+ " <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
129
+ " <td>Manhattan</td>\n",
130
+ " <td>Elise</td>\n",
131
+ " <td>flexible</td>\n",
132
+ " <td>I encourage you to use my kitchen, cooking and...</td>\n",
133
+ " <td>[0.00039011164335533977, 0.018310122191905975,...</td>\n",
134
+ " </tr>\n",
135
+ " <tr>\n",
136
+ " <th>3</th>\n",
137
+ " <td>204.0</td>\n",
138
+ " <td>55</td>\n",
139
+ " <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
140
+ " <td>Manhattan</td>\n",
141
+ " <td>Lyndon</td>\n",
142
+ " <td>moderate</td>\n",
143
+ " <td>Please no smoking in the house, porch or on th...</td>\n",
144
+ " <td>[-0.04602213576436043, 0.015605293214321136, 0...</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>4</th>\n",
148
+ " <td>577.0</td>\n",
149
+ " <td>25</td>\n",
150
+ " <td>Large Cozy 1 BR Apartment In Midtown East</td>\n",
151
+ " <td>Manhattan</td>\n",
152
+ " <td>Michelle</td>\n",
153
+ " <td>flexible</td>\n",
154
+ " <td>No smoking, please, and no drugs.</td>\n",
155
+ " <td>[-0.04859349876642227, -0.01263828668743372, 0...</td>\n",
156
+ " </tr>\n",
157
+ " <tr>\n",
158
+ " <th>...</th>\n",
159
+ " <td>...</td>\n",
160
+ " <td>...</td>\n",
161
+ " <td>...</td>\n",
162
+ " <td>...</td>\n",
163
+ " <td>...</td>\n",
164
+ " <td>...</td>\n",
165
+ " <td>...</td>\n",
166
+ " <td>...</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>9995</th>\n",
170
+ " <td>745.0</td>\n",
171
+ " <td>60</td>\n",
172
+ " <td>Upper West Side 1BR next to subway/Central Park</td>\n",
173
+ " <td>Manhattan</td>\n",
174
+ " <td>Doreen</td>\n",
175
+ " <td>strict</td>\n",
176
+ " <td>Our Herbivorian House manual with detailed rul...</td>\n",
177
+ " <td>[-0.0346745029091835, -0.005859952419996262, 0...</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>9996</th>\n",
181
+ " <td>1135.0</td>\n",
182
+ " <td>45</td>\n",
183
+ " <td>Modern and Bright Studio Apt in Williamsburg</td>\n",
184
+ " <td>Brooklyn</td>\n",
185
+ " <td>Shannon</td>\n",
186
+ " <td>strict</td>\n",
187
+ " <td>No smoking please!</td>\n",
188
+ " <td>[-0.016586357727646828, 0.020517650991678238, ...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>9997</th>\n",
192
+ " <td>59.0</td>\n",
193
+ " <td>45</td>\n",
194
+ " <td>Holiday in Trendy Williamsburg Apt!</td>\n",
195
+ " <td>Brooklyn</td>\n",
196
+ " <td>Peter</td>\n",
197
+ " <td>strict</td>\n",
198
+ " <td>We suggest you use email or texting contact us...</td>\n",
199
+ " <td>[-0.05095353722572327, 0.08510775864124298, -0...</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>9998</th>\n",
203
+ " <td>1055.0</td>\n",
204
+ " <td>25</td>\n",
205
+ " <td>Greenwich Village| Private Queen room</td>\n",
206
+ " <td>Manhattan</td>\n",
207
+ " <td>Kelly</td>\n",
208
+ " <td>flexible</td>\n",
209
+ " <td>Please treat this house as if it is your own. ...</td>\n",
210
+ " <td>[0.00017118529649451375, 0.010939894244074821,...</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>9999</th>\n",
214
+ " <td>285.0</td>\n",
215
+ " <td>25</td>\n",
216
+ " <td>Comfortable bedroom in spacious apt</td>\n",
217
+ " <td>Brooklyn</td>\n",
218
+ " <td>Arthur</td>\n",
219
+ " <td>strict</td>\n",
220
+ " <td>Please, No smoking and no pets. We do require ...</td>\n",
221
+ " <td>[-0.01795135624706745, -0.029596544802188873, ...</td>\n",
222
+ " </tr>\n",
223
+ " </tbody>\n",
224
+ "</table>\n",
225
+ "<p>10000 rows × 8 columns</p>\n",
226
+ "</div>"
227
+ ],
228
+ "text/plain": [
229
+ " price sq. meters description \\\n",
230
+ "0 966.0 25 Clean & quiet apt home by the park \n",
231
+ "1 142.0 25 Skylit Midtown Castle \n",
232
+ "2 620.0 45 THE VILLAGE OF HARLEM....NEW YORK ! \n",
233
+ "3 204.0 55 Entire Apt: Spacious Studio/Loft by central park \n",
234
+ "4 577.0 25 Large Cozy 1 BR Apartment In Midtown East \n",
235
+ "... ... ... ... \n",
236
+ "9995 745.0 60 Upper West Side 1BR next to subway/Central Park \n",
237
+ "9996 1135.0 45 Modern and Bright Studio Apt in Williamsburg \n",
238
+ "9997 59.0 45 Holiday in Trendy Williamsburg Apt! \n",
239
+ "9998 1055.0 25 Greenwich Village| Private Queen room \n",
240
+ "9999 285.0 25 Comfortable bedroom in spacious apt \n",
241
+ "\n",
242
+ " neighbourhood group host name cancellation_policy \\\n",
243
+ "0 Brooklyn Madaline strict \n",
244
+ "1 Manhattan Jenna moderate \n",
245
+ "2 Manhattan Elise flexible \n",
246
+ "3 Manhattan Lyndon moderate \n",
247
+ "4 Manhattan Michelle flexible \n",
248
+ "... ... ... ... \n",
249
+ "9995 Manhattan Doreen strict \n",
250
+ "9996 Brooklyn Shannon strict \n",
251
+ "9997 Brooklyn Peter strict \n",
252
+ "9998 Manhattan Kelly flexible \n",
253
+ "9999 Brooklyn Arthur strict \n",
254
+ "\n",
255
+ " house_rules \\\n",
256
+ "0 Clean up and treat the home the way you'd like... \n",
257
+ "1 Pet friendly but please confirm with me if the... \n",
258
+ "2 I encourage you to use my kitchen, cooking and... \n",
259
+ "3 Please no smoking in the house, porch or on th... \n",
260
+ "4 No smoking, please, and no drugs. \n",
261
+ "... ... \n",
262
+ "9995 Our Herbivorian House manual with detailed rul... \n",
263
+ "9996 No smoking please! \n",
264
+ "9997 We suggest you use email or texting contact us... \n",
265
+ "9998 Please treat this house as if it is your own. ... \n",
266
+ "9999 Please, No smoking and no pets. We do require ... \n",
267
+ "\n",
268
+ " text_vector_ \n",
269
+ "0 [-0.047521110624074936, 0.03044620156288147, 0... \n",
270
+ "1 [-0.04690079391002655, 0.061329323798418045, 0... \n",
271
+ "2 [0.00039011164335533977, 0.018310122191905975,... \n",
272
+ "3 [-0.04602213576436043, 0.015605293214321136, 0... \n",
273
+ "4 [-0.04859349876642227, -0.01263828668743372, 0... \n",
274
+ "... ... \n",
275
+ "9995 [-0.0346745029091835, -0.005859952419996262, 0... \n",
276
+ "9996 [-0.016586357727646828, 0.020517650991678238, ... \n",
277
+ "9997 [-0.05095353722572327, 0.08510775864124298, -0... \n",
278
+ "9998 [0.00017118529649451375, 0.010939894244074821,... \n",
279
+ "9999 [-0.01795135624706745, -0.029596544802188873, ... \n",
280
+ "\n",
281
+ "[10000 rows x 8 columns]"
282
+ ]
283
+ },
284
+ "execution_count": 72,
285
+ "metadata": {},
286
+ "output_type": "execute_result"
287
+ }
288
+ ],
289
+ "source": [
290
+ "import pandas as pd\n",
291
+ "from tqdm import tqdm\n",
292
+ "from sentence_transformers import SentenceTransformer\n",
293
+ "tqdm.pandas()\n",
294
+ "\n",
295
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
296
+ "\n",
297
+ "#encode df version: for small dataset only\n",
298
+ "df['text_vector_'] = df['description'].progress_apply(lambda x : model.encode(x).tolist())\n",
299
+ "df"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "df = pd.read_parquet('df_encoded.parquet')\n",
309
+ "df['neighbourhood group'][0:2500] = df['neighbourhood group'][0:2500].apply(lambda x : 'Manhattan')\n",
310
+ "df['neighbourhood group'][2500:5000] = df['neighbourhood group'][0:2500].apply(lambda x : 'Brooklyn')\n",
311
+ "df['neighbourhood group'][5000:7500] = df['neighbourhood group'][0:2500].apply(lambda x : 'Queens')\n",
312
+ "df['neighbourhood group'][7500:] = df['neighbourhood group'][0:2500].apply(lambda x : 'Bronx')\n",
313
+ "df['location'] = df['neighbourhood group']\n",
314
+ "df = df[['price', 'sq. meters', 'description', 'location', 'host name', 'cancellation_policy', 'house_rules', 'text_vector_']]\n",
315
+ "df = df.reset_index(drop=True)\n",
316
+ "df"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 145,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "from sklearn.neighbors import NearestNeighbors\n",
326
+ "import numpy as np\n",
327
+ "import pandas as pd\n",
328
+ "\n",
329
+ "from sentence_transformers import SentenceTransformer\n",
330
+ "\n",
331
+ "# df = df.read_parquet('df_encoded.parquet')\n",
332
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
333
+ "\n",
334
+ "#prepare model\n",
335
+ "# nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": 213,
341
+ "metadata": {},
342
+ "outputs": [
343
+ {
344
+ "name": "stderr",
345
+ "output_type": "stream",
346
+ "text": [
347
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Slider, please remove them: {'step_size': 100}\n",
348
+ " warnings.warn(\n",
349
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
350
+ " warnings.warn(\n"
351
+ ]
352
+ },
353
+ {
354
+ "name": "stdout",
355
+ "output_type": "stream",
356
+ "text": [
357
+ "Running on local URL: http://127.0.0.1:7901\n",
358
+ "\n",
359
+ "To create a public link, set `share=True` in `launch()`.\n"
360
+ ]
361
+ },
362
+ {
363
+ "data": {
364
+ "text/html": [
365
+ "<div><iframe src=\"http://127.0.0.1:7901/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
366
+ ],
367
+ "text/plain": [
368
+ "<IPython.core.display.HTML object>"
369
+ ]
370
+ },
371
+ "metadata": {},
372
+ "output_type": "display_data"
373
+ },
374
+ {
375
+ "data": {
376
+ "text/plain": []
377
+ },
378
+ "execution_count": 213,
379
+ "metadata": {},
380
+ "output_type": "execute_result"
381
+ },
382
+ {
383
+ "name": "stdout",
384
+ "output_type": "stream",
385
+ "text": [
386
+ "[[700, 45, 'Queens', 'I want to take a break from work 😴!!!']]\n"
387
+ ]
388
+ }
389
+ ],
390
+ "source": [
391
+ "import gradio as gr\n",
392
+ "import statistics\n",
393
+ "\n",
394
+ "def closest_number(x):\n",
395
+ " closest_numbers = [10, 20, 30, 40]\n",
396
+ " closest_number = closest_numbers[0]\n",
397
+ " min_distance = abs(x - closest_number)\n",
398
+ " for number in closest_numbers[1:]:\n",
399
+ " distance = abs(x - number)\n",
400
+ " if distance < min_distance:\n",
401
+ " closest_number = number\n",
402
+ " min_distance = distance\n",
403
+ " return closest_number\n",
404
+ "\n",
405
+ "def search(df, query):\n",
406
+ " product = model.encode(query).tolist()\n",
407
+ " # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
408
+ "\n",
409
+ " nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
410
+ " distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
411
+ "\n",
412
+ " #print out the description of every recommended product\n",
413
+ " df_search = df.iloc[list(indices)[0]].drop(['text_vector_'], axis=1) #.sort_values('avgFeedbackScore', ascending=False)\n",
414
+ "\n",
415
+ " return df_search.sort_values('price', ascending=False)\n",
416
+ "\n",
417
+ "def filter_df(df, column_name, filter_type, filter_value):\n",
418
+ " if filter_type == '==':\n",
419
+ " df_filtered = df[df[column_name]==filter_value]\n",
420
+ " elif filter_type == '>=':\n",
421
+ " df_filtered = df[df[column_name]>=filter_value]\n",
422
+ " elif filter_type == '<=':\n",
423
+ " df_filtered = df[df[column_name]<=filter_value]\n",
424
+ " return df_filtered\n",
425
+ "\n",
426
+ "history = list()\n",
427
+ "def predict(input1, input2, input3, input4):\n",
428
+ " history.append([input1, input2, input3, input4])\n",
429
+ "\n",
430
+ " print(history)\n",
431
+ " df_location = filter_df(df, 'location', '==', input3)\n",
432
+ " df_size = filter_df(df_location, 'sq. meters', '==', input2)\n",
433
+ " df_price = filter_df(df_size, 'price', '<=', input1)\n",
434
+ " df_result = search(df_price, input4)\n",
435
+ "\n",
436
+ " prediction = [\n",
437
+ " round(statistics.mean([x[0] for x in history])), #price\n",
438
+ " closest_number(statistics.mean([x[1] for x in history])), #square room\n",
439
+ " statistics.mode([x[2] for x in history]) #state\n",
440
+ " ]\n",
441
+ "\n",
442
+ " return df_result, prediction\n",
443
+ "\n",
444
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
445
+ " gr.Markdown(\n",
446
+ " \"\"\"\n",
447
+ " # Airbnb Search Engine\n",
448
+ " \"\"\"\n",
449
+ " )\n",
450
+ " input1 = gr.Slider(100, 1200, value=700, step_size=100, label=\"Max Price\")\n",
451
+ " input2 = gr.Radio([25, 40, 45, 55, 60, 70], multiselect=False, label='square meters', value=45)\n",
452
+ " input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], multiselect=False, label='State', value='Queens')\n",
453
+ " input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')\n",
454
+ "\n",
455
+ " btn = gr.Button(value=\"Search for a Room\")\n",
456
+ " output1 = gr.Dataframe()\n",
457
+ " output2 = gr.Textbox(label='prediction for the next search')\n",
458
+ " # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
459
+ " btn.click(predict, [input1, input2, input3, input4], [output1, output2])\n",
460
+ "demo.launch(share=False)"
461
+ ]
462
+ }
463
+ ],
464
+ "metadata": {
465
+ "kernelspec": {
466
+ "display_name": "Python 3",
467
+ "language": "python",
468
+ "name": "python3"
469
+ },
470
+ "language_info": {
471
+ "codemirror_mode": {
472
+ "name": "ipython",
473
+ "version": 3
474
+ },
475
+ "file_extension": ".py",
476
+ "mimetype": "text/x-python",
477
+ "name": "python",
478
+ "nbconvert_exporter": "python",
479
+ "pygments_lexer": "ipython3",
480
+ "version": "3.9.13"
481
+ },
482
+ "orig_nbformat": 4
483
+ },
484
+ "nbformat": 4,
485
+ "nbformat_minor": 2
486
+ }
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.system('pip install openpyxl')
3
+ os.system('pip install sentence-transformers')
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import statistics
7
+ from sklearn.neighbors import NearestNeighbors
8
+ from sentence_transformers import SentenceTransformer
9
+
10
# Load the listings together with their precomputed description embeddings
# (the 'text_vector_' column).
df = pd.read_parquet('df_encoded.parquet')

# Demo data: overwrite the borough column with four position-based buckets.
# Each bucket is written with a single .iloc call rather than chained
# indexing (df[col][a:b] = ...): the chained form triggers
# SettingWithCopyWarning and silently leaves df untouched when pandas
# copy-on-write is active.
_borough_col = df.columns.get_loc('neighbourhood group')
for _start, _stop, _borough in (
    (0, 2500, 'Manhattan'),
    (2500, 5000, 'Brooklyn'),
    (5000, 7500, 'Queens'),
    (7500, None, 'Bronx'),
):
    df.iloc[_start:_stop, _borough_col] = _borough

df['location'] = df['neighbourhood group']
df = df[['price', 'sq. meters', 'description', 'location', 'host name',
         'cancellation_policy', 'house_rules', 'text_vector_']]
df = df.reset_index(drop=True)

# Sentence encoder used to embed search queries
# (alternative checkpoint: all-MiniLM-L6-v2).
model = SentenceTransformer('all-mpnet-base-v2')

# No global nearest-neighbour index is built here: search() fits a fresh one
# on every call, on whatever rows are left after filtering.
24
+
25
def closest_number(x, candidates=(25, 40, 45, 55, 60, 70)):
    """Return the entry of *candidates* that is numerically nearest to *x*.

    The default candidates are the square-meter options offered by the
    search form, so a predicted size is always a value the user can pick.
    (The previous hard-coded list ``[10, 20, 30, 40]`` did not match those
    options and could never yield 45, 55, 60 or 70.)

    Args:
        x: The number to snap, e.g. the mean of previously chosen sizes.
        candidates: Non-empty iterable of allowed values.

    Returns:
        The nearest candidate. On a tie the candidate listed first wins.

    Raises:
        ValueError: If *candidates* is empty.
    """
    # min() keeps the first of several equally distant candidates, which
    # matches the strict "<" comparison of the original hand-written loop.
    return min(candidates, key=lambda candidate: abs(x - candidate))
35
+
36
def search(df, query, n_results=3):
    """Return the rows of *df* whose embeddings are closest to *query*.

    The query is encoded with the module-level sentence-transformer
    ``model`` and compared against the vectors stored in the
    ``'text_vector_'`` column using a ball-tree nearest-neighbour index
    that is fitted on *df* for this call only.

    Args:
        df: Candidate listings; must contain ``'text_vector_'`` and
            ``'price'`` columns.
        query: Free-text search query.
        n_results: Maximum number of rows to return.

    Returns:
        A DataFrame without the ``'text_vector_'`` column, holding at most
        *n_results* rows sorted by ``'price'`` in descending order. An
        empty *df* yields an empty result instead of an error.
    """
    # Stacked filters can leave nothing to search; fitting an index on zero
    # samples would raise, so answer with an empty table right away.
    if df.empty:
        return df.drop(columns=['text_vector_'])

    query_vector = model.encode(query).tolist()
    vectors = df['text_vector_'].values.tolist()

    # kneighbors() rejects n_neighbors larger than the number of fitted
    # samples, so never ask for more rows than are available.
    n_neighbors = min(n_results, len(vectors))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(vectors)
    _, indices = nbrs.kneighbors([query_vector])

    # indices has one row per query vector; only one query was submitted.
    df_search = df.iloc[indices[0]].drop(columns=['text_vector_'])
    return df_search.sort_values('price', ascending=False)
47
+
48
def filter_df(df, column_name, filter_type, filter_value):
    """Return the rows of *df* whose *column_name* satisfies a comparison.

    Args:
        df: DataFrame to filter.
        column_name: Name of the column to compare.
        filter_type: One of ``'=='``, ``'>='`` or ``'<='``.
        filter_value: Value each entry of the column is compared against.

    Returns:
        A DataFrame containing only the matching rows.

    Raises:
        ValueError: If *filter_type* is not a supported operator. (The
            original fell through to an unbound local variable instead.)
    """
    import operator

    comparisons = {'==': operator.eq, '>=': operator.ge, '<=': operator.le}
    try:
        compare = comparisons[filter_type]
    except KeyError:
        raise ValueError(
            f"unsupported filter_type {filter_type!r}; "
            f"expected one of {sorted(comparisons)}"
        ) from None
    return df[compare(df[column_name], filter_value)]
56
+
57
# Every search submitted since the process started, as
# [max price, square meters, location, query] entries.
history = []


def predict(input1, input2, input3, input4):
    """Run one filtered search and guess the filters of the next one.

    Args:
        input1: Maximum price; listings priced above it are dropped.
        input2: Square meters; only listings with exactly this size are kept.
        input3: Location; only listings in exactly this location are kept.
        input4: Free-text query handed to ``search``.

    Returns:
        A pair ``(df_result, prediction)`` where ``df_result`` is the output
        of ``search`` on the filtered listings and ``prediction`` is the
        list ``[rounded mean of past prices, closest_number of the mean of
        past sizes, most common past location]`` computed over ``history``,
        including the current call.
    """
    history.append([input1, input2, input3, input4])
    print(history)

    # Narrow the catalogue one filter at a time: location, size, price cap.
    candidates = filter_df(df, 'location', '==', input3)
    candidates = filter_df(candidates, 'sq. meters', '==', input2)
    candidates = filter_df(candidates, 'price', '<=', input1)
    df_result = search(candidates, input4)

    # Transpose the history into one sequence per input field.
    past_prices, past_sizes, past_locations, _past_queries = zip(*history)
    predicted_price = round(statistics.mean(past_prices))
    predicted_size = closest_number(statistics.mean(past_sizes))
    predicted_location = statistics.mode(past_locations)

    return df_result, [predicted_price, predicted_size, predicted_location]
74
+
75
# Search form: three filters plus a free-text query, wired to predict().
with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    gr.Markdown(
        """
        # Airbnb Search Engine
        """
    )
    # The Slider keyword for the increment is `step`; the previous
    # `step_size=100` was reported by gradio as an unused kwarg and ignored.
    input1 = gr.Slider(100, 1200, value=700, step=100, label="Max Price")
    # Radio is single-choice by definition; the previous `multiselect=False`
    # was likewise reported as an unused kwarg, so it is dropped.
    input2 = gr.Radio([25, 40, 45, 55, 60, 70], label='square meters', value=45)
    input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], label='State', value='Queens')
    input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')

    btn = gr.Button(value="Search for a Room")
    output1 = gr.Dataframe()
    output2 = gr.Textbox(label='prediction for the next search')
    btn.click(predict, [input1, input2, input3, input4], [output1, output2])

demo.launch(share=False)
df_encoded.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe09f27cabb790b1de79ba1483bceded0499ef48627bde47756b1905dd72a91
3
+ size 48169491