Tom Beer committed on
Commit
cda78c3
1 Parent(s): 995f54d

add data_processing.ipynb

Browse files
Files changed (1) hide show
  1. data_processing.ipynb +408 -0
data_processing.ipynb ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import seaborn as sns\n",
10
+ "import json\n",
11
+ "import pandas as pd\n",
12
+ "from numpy import mean, percentile, array\n",
13
+ "from numpy.random import permutation as perm\n",
14
+ "from pathlib import Path"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "base_dir = Path()\n",
24
+ "data_dir = base_dir / \"data\""
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "def read_jsonl(file_path):\n",
34
+ " data = []\n",
35
+ " with open(file_path, \"r\") as file:\n",
36
+ " for i,line in enumerate(file):\n",
37
+ " data.append(json.loads(line))\n",
38
+ " return data\n",
39
+ "\n",
40
+ "reviews = read_jsonl(data_dir / \"cmu\" / \"raw\" / \"review.txt\")\n",
41
+ "offering = read_jsonl(data_dir / \"cmu\" / \"raw\" / \"offering.txt\")\n"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 25,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "with open(data_dir / \"cmu\" / \"processed\" / \"cities.json\", \"w\") as f:\n",
51
+ " json.dump(list(cities), f)\n",
52
+ " \n",
53
+ "with open(data_dir / \"cmu\" / \"processed\" / \"score_threshold_per_city.json\", \"w\") as f:\n",
54
+ " json.dump(score_threshold_per_city, f)\n",
55
+ "\n",
56
+ "with open(data_dir / \"cmu\" / \"processed\" / \"city_to_hotel_id_map.json\", \"w\") as f:\n",
57
+ " json.dump(city_to_hotel_id_map, f)\n",
58
+ "\n",
59
+ "with open(data_dir / \"cmu\" / \"processed\" / \"hotel_id_to_name_map.json\", \"w\") as f:\n",
60
+ " json.dump(hotel_id_to_name_map, f)\n",
61
+ "\n",
62
+ "with open(data_dir / \"cmu\" / \"processed\" / \"hotel_id_to_review_map.json\", \"w\") as f:\n",
63
+ " json.dump(hotel_id_to_review_map, f)\n",
64
+ " "
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 4,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "cities = set([hotel['address']['locality'] for hotel in offering])\n",
74
+ "city_to_hotel_id_map = {city: [hotel['id'] for hotel in offering \n",
75
+ " if hotel['address']['locality'] == city] for city in cities}"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 5,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "hotel_id_to_review_map = {}\n",
85
+ "for review in reviews:\n",
86
+ " review_info = {'text': review['text'], 'score': review['ratings']['overall'], 'num_helpful': review['num_helpful_votes']}\n",
87
+ " hotel_id_to_review_map.setdefault(review['offering_id'], []).append(review_info)\n",
88
+ "\n",
89
+ "for hotel_id, review_info in hotel_id_to_review_map.items():\n",
90
+ " average_score = mean([rev['score'] for rev in review_info])\n",
91
+ " hotel_id_to_review_map[hotel_id] = {'average_score': average_score, 'reviews': review_info}\n"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 6,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "hotel_id_to_name_map = {hotel[\"id\"]: hotel[\"name\"] for hotel in offering}"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 7,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "data": {
110
+ "image/png": "",
111
+ "text/plain": [
112
+ "<Figure size 640x480 with 1 Axes>"
113
+ ]
114
+ },
115
+ "metadata": {},
116
+ "output_type": "display_data"
117
+ }
118
+ ],
119
+ "source": [
120
+ "sns.histplot([len(hotel['reviews']) for hotel in hotel_id_to_review_map.values()]);"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": []
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 8,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "def calc_score_threshold_per_city(p):\n",
137
+ " res = {}\n",
138
+ " for city, idxs in city_to_hotel_id_map.items():\n",
139
+ " res[city] = percentile([hotel_id_to_review_map.get(idx, {'average_score': 0})['average_score'] for idx in idxs], p) \n",
140
+ " return res"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": []
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": 9,
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": [
156
+ "score_threshold_per_city=calc_score_threshold_per_city(80)"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 11,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "reviews_per_hotel_per_city = {}\n",
166
+ "for city in city_to_hotel_id_map:\n",
167
+ " for hotel_id in city_to_hotel_id_map[city]:\n",
168
+ " n_reviews = len(hotel_id_to_review_map.get(hotel_id, {'reviews': []})['reviews'])\n",
169
+ " reviews_per_hotel_per_city.setdefault(city, []).append(n_reviews)"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 13,
175
+ "metadata": {},
176
+ "outputs": [
177
+ {
178
+ "name": "stdout",
179
+ "output_type": "stream",
180
+ "text": [
181
+ "Boston: 73 hotels with more than 100 reviews\n",
182
+ "Seattle: 86 hotels with more than 100 reviews\n",
183
+ "San Jose: 24 hotels with more than 100 reviews\n",
184
+ "Charlotte: 49 hotels with more than 100 reviews\n",
185
+ "Chicago: 106 hotels with more than 100 reviews\n",
186
+ "Washington DC: 106 hotels with more than 100 reviews\n",
187
+ "Fort Worth: 15 hotels with more than 100 reviews\n",
188
+ "Jacksonville: 39 hotels with more than 100 reviews\n",
189
+ "Denver: 70 hotels with more than 100 reviews\n",
190
+ "Los Angeles: 142 hotels with more than 100 reviews\n",
191
+ "New York City: 327 hotels with more than 100 reviews\n",
192
+ "Dallas: 61 hotels with more than 100 reviews\n",
193
+ "Memphis: 42 hotels with more than 100 reviews\n",
194
+ "Phoenix: 62 hotels with more than 100 reviews\n",
195
+ "San Diego: 148 hotels with more than 100 reviews\n",
196
+ "Austin: 63 hotels with more than 100 reviews\n",
197
+ "Baltimore: 41 hotels with more than 100 reviews\n",
198
+ "San Antonio: 89 hotels with more than 100 reviews\n",
199
+ "Detroit: 17 hotels with more than 100 reviews\n",
200
+ "Indianapolis: 44 hotels with more than 100 reviews\n",
201
+ "San Francisco: 177 hotels with more than 100 reviews\n",
202
+ "Houston: 70 hotels with more than 100 reviews\n",
203
+ "Columbus: 38 hotels with more than 100 reviews\n",
204
+ "Philadelphia: 61 hotels with more than 100 reviews\n",
205
+ "El Paso: 11 hotels with more than 100 reviews\n"
206
+ ]
207
+ }
208
+ ],
209
+ "source": [
210
+ "for city, num_reviews in reviews_per_hotel_per_city.items():\n",
211
+ " score_threshold = score_threshold_per_city[city]\n",
212
+ " for idx in city_to_hotel_id_map[city]:\n",
213
+ " hotel_id_to_review_map.get(idx, {'average_score': 0})['average_score']\n",
214
+ " print(f\"{city}: {(array(num_reviews) > 70).sum()} hotels with more than 100 reviews\")"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "offering"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "markdown",
228
+ "metadata": {},
229
+ "source": [
230
+ "memory in gradio space\n",
231
+ "\n",
232
+ "Steps:\n",
233
+ "* Embed space in site\n",
234
+ "* Save preprocessed files\n",
235
+ "* Load from interface\n",
236
+ "* Return recommendation for Boston irrespective of text input\n",
237
+ " * Return context for LLM as output\n",
238
+ " * Set up OpenAI, return raw output with basic prompt\n",
239
+ "* Scroll menu for city\n",
240
+ "* Check box for kid friendly\n",
241
+ "* At the end - understand free text input\n",
242
+ " "
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 17,
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "hotel_id_to_review_map = {\n",
252
+ " hotel_id: {\n",
253
+ " \"reviews\": [\n",
254
+ " review for review in hotel_data[\"reviews\"] if review[\"num_helpful\"] > 10\n",
255
+ " ]\n",
256
+ " }\n",
257
+ " for hotel_id, hotel_data in hotel_id_to_review_map.items()\n",
258
+ " if len(hotel_id_to_review_map[hotel_id]['reviews']) > 100\n",
259
+ " and hotel_id_to_review_map[hotel_id]['average_score'] >= score_threshold_per_city[hotel_id_to_city_map[hotel_id]]\n",
260
+ "}"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 18,
266
+ "metadata": {},
267
+ "outputs": [],
268
+ "source": [
269
+ "hotel_id_to_city_map = {vi:k for k,v in city_to_hotel_id_map.items() for vi in v}"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 23,
275
+ "metadata": {},
276
+ "outputs": [
277
+ {
278
+ "name": "stdout",
279
+ "output_type": "stream",
280
+ "text": [
281
+ "{'hotel_name': 'Four Seasons Hotel Boston', 'positive': [{'text': \"FSHB is one of the best hotels in the country. Its employees are pleasant, professional, and well trained. They always have the guests' best interests in mind. The hotel is beautiful, the rooms spacious, clean, andextremely comfortable. The restaurant, Aujourd'hui, is among the best in the city. The food is exquisite. Room service and brunch are also wonderful. This is the best Boston has to offer!\", 'score': 5.0, 'num_helpful': 11}, {'text': 'I had the opportunity to stay at the 4 seasons for the Boston Marathon. It was my first time to Boston (where I had wanted to go since my early twenties, (48 now)!. I have to say the 4 seasons made this the best experience of my life. From the pickup at the airport to the drop off I was simply amazed! Everyone was so nice, Daniel the concierge helped me get a tour of the city prior to other relatives arriving and referred me to 5th Avenue Limo Service. Use them please, they are great! The driver was so knowledgeable and darn I cant remember his name! The service, the room, the ammenities were all above par. I have to admit the only thing i did not like was the darn curtain in the shower. seriously, all hotels that r 4 or 5 star even 3, need to have shower enclosures! the bedding was excellent, the executive suite overlooking boston gardens and boston common - oh my - the view and listening to the people out on the street was great. you can open a window from these rooms and feel the ambiance of the city. I will stay at the 4 seasons probably every time i return to boston!!! kudos to the hotel management!', 'score': 5.0, 'num_helpful': 11}, {'text': 'Just returned from a visit to Four Seasons Boston and the service was excellent. We arrived early in the AM from a red-eye flight and the terrific woman at the front desk (Eliva?)not only let us check in early but upgraded us to a newly renovated room. 
The doormen,bellman,concierge and staff at the health club all were first rate. Health club facilities were great, enjoyed the newly renovated steam room and sauna. Of course, as is typical of the Four Seasons the bed was sublime. We have stayed at this property in the past and never been disappointed. Last year due to some of the negative reviews about Four Seasons Boston we stayed at Beacon XV- what a mistake! The service at Beacon XV is truly awful, does not even compare.', 'score': 5.0, 'num_helpful': 11}], 'negative': [{'text': \"After staying at the hotel for a wedding, my room was broken into and a significant amount of Jewelry stolen. The Boston Police Department has since found the thief and has a video of the gentlemen leaving my hotel room. Through this terrible ordeal the 4 Seasons has been unbelievable. They have claimed that it is not their responsibility and refuse to insure my belongings. On top of everything, I was shocked when they didn't even bother to comp my room after I had to deal with a police report all day. Do not stay here unless you want to deal with a rude and terrible staff at a 2-3 star hotel. Try the Taj\", 'score': 1.0, 'num_helpful': 14}]}\n"
282
+ ]
283
+ }
284
+ ],
285
+ "source": [
286
+ "city = \"Boston\"\n",
287
+ "score_threshold = score_threshold_per_city[city]\n",
288
+ "for hotel_id in perm(city_to_hotel_id_map[city]):\n",
289
+ " try:\n",
290
+ " hotel_reviews = hotel_id_to_review_map[hotel_id]['reviews']\n",
291
+ " except KeyError:\n",
292
+ " continue\n",
293
+ " res = {\"hotel_name\": hotel_id_to_name_map[hotel_id], 'positive': [], 'negative': []} \n",
294
+ " hotel_reviews = hotel_id_to_review_map[hotel_id]['reviews']\n",
295
+ " for review in perm(hotel_reviews):\n",
296
+ " if review['num_helpful'] > 10:\n",
297
+ " if (review['score'] == 5) & (len(res['positive']) < 3):\n",
298
+ " res['positive'].append(review)\n",
299
+ " if (review['score'] <= 2) & (len(res['negative']) < 1):\n",
300
+ " res['negative'].append(review)\n",
301
+ " if (len(res['positive']) >= 3) & (len(res['negative']) >= 1):\n",
302
+ " break\n",
303
+ "\n",
304
+ "print(res)\n"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": null,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": []
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "metadata": {},
318
+ "outputs": [],
319
+ "source": []
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": null,
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": []
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "metadata": {},
332
+ "outputs": [],
333
+ "source": [
334
+ " "
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "execution_count": null,
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": []
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": null,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": []
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": null,
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": []
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "metadata": {},
362
+ "outputs": [],
363
+ "source": []
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "metadata": {},
369
+ "outputs": [],
370
+ "source": []
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": null,
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": []
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": []
385
+ }
386
+ ],
387
+ "metadata": {
388
+ "kernelspec": {
389
+ "display_name": "adults",
390
+ "language": "python",
391
+ "name": "adults"
392
+ },
393
+ "language_info": {
394
+ "codemirror_mode": {
395
+ "name": "ipython",
396
+ "version": 3
397
+ },
398
+ "file_extension": ".py",
399
+ "mimetype": "text/x-python",
400
+ "name": "python",
401
+ "nbconvert_exporter": "python",
402
+ "pygments_lexer": "ipython3",
403
+ "version": "3.9.9"
404
+ }
405
+ },
406
+ "nbformat": 4,
407
+ "nbformat_minor": 4
408
+ }