DrChamyoung committed on
Commit
b05ab4f
1 Parent(s): b445706

Create ModelOptimization.py

Browse files
Files changed (1) hide show
  1. ModelOptimization.py +103 -0
ModelOptimization.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import json
4
+ from Quin.Core import ModelOptimization
5
+
6
+ EPSILON = 1e-5  # Small offset added to `price` before np.log in FeatureEngineer.transform, so a zero price stays finite.
7
+
8
+
9
class FeatureEngineer(ModelOptimization):
    """Turn raw listing records into a numeric-only feature frame.

    The transformer reads the columns ``features`` (an iterable of strings
    per row), ``photos``, ``created``, ``bathrooms``, ``bedrooms``, ``price``,
    ``latitude``, ``longitude`` and ``listing_id``.  It is stateless: ``fit``
    learns nothing and ``transform`` never mutates its input.
    """

    def apply(self, df, k, condition):
        """Add an int8 indicator column ``k`` to ``df`` in place.

        Args:
            df: Frame holding a ``features`` column of normalized text.
            k: Name of the indicator column to create or overwrite.
            condition: Predicate called with each row's feature text; its
                truthiness becomes 1/0 in the new column.
        """
        df[k] = df['features'].apply(condition).astype(np.int8)

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X, y=None):
        """Build the engineered feature frame from ``X``.

        Args:
            X: DataFrame with the columns listed in the class docstring.
            y: Unused; accepted for pipeline compatibility.

        Returns:
            A new DataFrame containing only ``int64``/``float64``/``int8``
            columns, without ``latitude``, ``longitude`` and ``listing_id``.

        Raises:
            KeyError: If a required column is missing, or if ``latitude``,
                ``longitude`` or ``listing_id`` is absent or non-numeric.
        """
        df = X.copy()

        # Count the listed features while `features` is still a collection;
        # after the join below, len() would count characters instead.
        feature_counts = df['features'].apply(len)

        # Flatten each feature list into one lowercase string in which every
        # multi-word feature becomes a single underscore-joined token.
        df['features'] = df['features'].apply(
            lambda items: ' '.join(item.replace(' ', '_') for item in items)
            .lower()
            .replace('-', '_')
        )

        # (column name, predicate over the normalized feature text).
        # Order matters: it fixes the column order of the output frame.
        keyword_flags = (
            ('dishwasher', lambda x: 'dishwasher' in x),
            ('doorman', lambda x: 'doorman' in x or 'concierge' in x),
            # The parentheses are essential: without them `and` binds tighter
            # than `or`, and since 'pet' is a substring of 'no_pets' every
            # "No Pets" listing would be flagged as pet friendly.
            ('pets', lambda x: ('pet' in x or 'dog' in x or 'cats' in x)
                and 'no_pets' not in x),
            ('air_conditioning', lambda x: 'air_conditioning' in x or 'central' in x),
            ('parking', lambda x: 'parking' in x),
            ('balcony', lambda x: 'balcony' in x or 'deck' in x
                or 'terrace' in x or 'patio' in x),
            ('bike', lambda x: 'bike' in x),
            ('storage', lambda x: 'storage' in x),
            ('outdoor', lambda x: 'outdoor' in x or 'courtyard' in x or 'garden' in x),
            ('roof', lambda x: 'roof' in x),
            ('gym', lambda x: 'gym' in x or 'fitness' in x),
            ('pool', lambda x: 'pool' in x),
            ('backyard', lambda x: 'backyard' in x),
            ('laundry', lambda x: 'laundry' in x),
            ('hardwood_floors', lambda x: 'hardwood_floors' in x),
            ('new_construction', lambda x: 'new_construction' in x),
            ('dryer', lambda x: 'dryer' in x),
            ('elevator', lambda x: 'elevator' in x),
            ('garage', lambda x: 'garage' in x),
            ('pre_war', lambda x: 'pre_war' in x or 'prewar' in x),
            ('post_war', lambda x: 'post_war' in x or 'postwar' in x),
            ('no_fee', lambda x: 'no_fee' in x),
            ('low_fee', lambda x: 'reduced_fee' in x or 'low_fee' in x),
            ('fire', lambda x: 'fireplace' in x),
            ('private', lambda x: 'private' in x),
            ('wheelchair', lambda x: 'wheelchair' in x),
            ('internet', lambda x: 'wifi' in x or 'wi_fi' in x or 'internet' in x),
            ('yoga', lambda x: 'yoga' in x),
            ('furnished', lambda x: 'furnished' in x),
            ('multi_level', lambda x: 'multi_level' in x),
            ('exclusive', lambda x: 'exclusive' in x),
            ('high_ceil', lambda x: 'high_ceil' in x),
            ('green', lambda x: 'green_b' in x),
            ('stainless', lambda x: 'stainless_' in x),
            ('simplex', lambda x: 'simplex' in x),
            ('public', lambda x: 'public' in x),
        )
        for column, predicate in keyword_flags:
            self.apply(df, column, predicate)

        # Cap room counts at 5 (anything not strictly below 5 becomes 5).
        df['bathrooms'] = df['bathrooms'].apply(lambda x: x if x < 5 else 5)
        df['bedrooms'] = df['bedrooms'].apply(lambda x: x if x < 5 else 5)
        df["num_photos"] = df["photos"].apply(len)
        df["num_features"] = feature_counts

        # Listing age in whole days, measured against the current date.
        created = pd.to_datetime(df.pop("created"))
        df["listing_age"] = (pd.to_datetime('today') - created).apply(lambda x: x.days)

        df["room_dif"] = df["bedrooms"] - df["bathrooms"]
        df["room_sum"] = df["bedrooms"] + df["bathrooms"]
        # Floor the divisor at 0.5 so listings with no rooms do not divide by zero.
        room_divisor = df["room_sum"].apply(lambda x: max(x, .5))
        df["price_per_room"] = df["price"] / room_divisor
        df["bedrooms_share"] = df["bedrooms"] / room_divisor
        # Log-scale the price; EPSILON keeps the log finite for a zero price.
        df['price'] = df['price'].apply(lambda x: np.log(x + EPSILON))

        # Keep only the numeric dtypes produced above (drops text/list columns).
        numeric_dtypes = ('int64', 'float64', 'int8')
        non_numeric = [name for name, dtype in df.dtypes.to_dict().items()
                       if dtype.name not in numeric_dtypes]
        df = df.drop(columns=non_numeric)

        # Remove the location and identifier columns from the output.
        df = df.drop(columns=['latitude', 'longitude', 'listing_id'])
        return df
84
+
85
+
86
def encode(x):
    """Map an interest-level label to its ordinal code.

    Args:
        x: The label to encode; expected to be ``'low'``, ``'medium'`` or
            ``'high'``.

    Returns:
        ``0`` for ``'low'``, ``1`` for ``'medium'``, ``2`` for ``'high'``,
        and ``None`` for any other value (including non-string input).
    """
    # Non-strings can never equal a label; returning early also keeps
    # unhashable values away from the dict lookup below.
    if not isinstance(x, str):
        return None
    codes = {'low': 0, 'medium': 1, 'high': 2}
    return codes.get(x)
93
+
94
+
95
def get_data(path='train.json'):
    """Load the training JSON and return engineered features with targets.

    Args:
        path: Location of the JSON file to read.  Defaults to
            ``'train.json'`` in the current working directory.

    Returns:
        A ``(df, target)`` tuple: ``df`` is the output of
        ``FeatureEngineer().fit_transform`` on the loaded records, and
        ``target`` is the ``interest_level`` column passed through ``encode``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the data has no ``interest_level`` column.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(path, 'r', encoding='utf-8') as raw_data:
        data = json.load(raw_data)

    df = pd.DataFrame(data)
    # Separate the label before feature engineering so it cannot leak into df.
    target = df.pop('interest_level').apply(encode)

    df = FeatureEngineer().fit_transform(df)
    return df, target