antonbol committed on
Commit
33e9cd7
1 Parent(s): e8c8ffd

added feature_eng

Browse files
Files changed (1) hide show
  1. feature_engineering.py +101 -0
feature_engineering.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

################ Dicts with encodings ################
# Integer codes for the raw categorical values of single columns.
sex_dict = {"female": 1, "male": 0}
embarked_dict = {"S": 0, "C": 1, "Q": 2}

# Column name -> {raw value: integer code}, in the nested-dict shape accepted
# by ``DataFrame.replace``.  The inner dicts are independent copies, so
# mutating ``sex_dict`` / ``embarked_dict`` does not affect this table.
# Note the mixed casing of the keys: consumers that lower-case their column
# names have to normalise the "Cabin" key themselves.
cleanup_catergories = {
    "sex": dict(sex_dict),
    "embarked": dict(embarked_dict),
    # First letter of the cabin number; "N" marks a missing cabin.
    "Cabin": {
        "N": 0,
        "C": 1,
        "E": 2,
        "G": 3,
        "D": 4,
        "A": 5,
        "B": 6,
        "F": 7,
        "T": 8,
    },
}

# Reversed lookup (code -> titles).  Disabled: kept for reference only and
# not defined at runtime.
# title_dict = {
#     0: ["Mr"],
#     1: ["Miss"],
#     2: ["Mrs"],
#     3: ["Master"],
#     # Rare titles, not worth individual categories
#     4: [
#         "Dr",
#         "Rev",
#         "Mlle",
#         "Major",
#         "Col",
#         "Countess",
#         "Capt",
#         "Ms",
#         "Sir",
#         "Lady",
#         "Nme",
#         "Don",
#         "Jonkheer",
#     ],
# }
#####################################################

def feat_eng(df=None):
    """
    Main function containing the feature engineering part
    of the pipeline.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw Titanic passenger table with (at least) the columns ``Embarked``,
        ``Age``, ``Cabin`` and either ``Name`` or ``Title``.  When omitted or
        ``None`` the lab CSV is downloaded instead.  The caller's frame is
        never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with lower-cased column names in which

        * ``ticket`` and ``fare`` are removed and rows without an embarkation
          port are dropped,
        * ``name`` is replaced by ``title`` (the word preceding the first
          ``.`` in the name),
        * missing ages are sampled from the known ages of passengers sharing
          the same title, then ``age`` is truncated to whole years and binned,
        * ``cabin`` is reduced to its first letter (``"N"`` when missing),
        * the columns listed in ``cleanup_catergories`` are integer-encoded.

    Notes
    -----
    Age imputation draws from NumPy's global random state, so results are
    only reproducible if the caller seeds ``numpy.random`` beforehand.
    """
    import numpy as np
    import pandas as pd

    # Load the data frame only when the caller did not supply one (the
    # argument used to be overwritten unconditionally).
    if df is None:
        df = pd.read_csv(
            "https://raw.githubusercontent.com/ID2223KTH/id2223kth.github.io/master/assignments/lab1/titanic.csv"
        )

    # Drop features and NaNs.  ``drop`` returns a new frame and ``.copy()``
    # detaches the filtered rows, so the input is untouched and the later
    # column assignments are not chained assignments on a slice.
    df = df.drop(columns=["Ticket", "Fare"], errors="ignore")
    df = df.loc[df["Embarked"].notna()].copy()

    # Feature engineering
    # Create a title feature
    if "Name" in df.columns:
        df["Title"] = df["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)
        df = df.drop(columns="Name")

    # Impute missing ages by sampling from the known ages of passengers with
    # the same title.  Drawing uniformly from the list of known ages is the
    # same as drawing each distinct age with its relative frequency.
    age_values = df["Age"].to_numpy(dtype="float64", na_value=np.nan, copy=True)
    age_missing = np.isnan(age_values)
    all_known_ages = age_values[~age_missing]
    if age_missing.any() and all_known_ages.size:
        titles = df["Title"]
        title_missing = titles.isna().to_numpy()
        for title in titles[age_missing].unique():
            if pd.isna(title):
                same_title = title_missing
            else:
                same_title = (titles == title).to_numpy()
            pool = age_values[same_title & ~age_missing]
            if pool.size == 0:
                # No passenger with this title has a known age: fall back to
                # the overall age distribution instead of failing.
                pool = all_known_ages
            target = same_title & age_missing
            age_values[target] = np.random.choice(pool, size=int(target.sum()))

    # Truncate ages to whole years (NaN-safe equivalent of the int cast) and
    # bin them.  ``include_lowest`` keeps infants whose age truncates to 0 in
    # the first bin instead of turning them into NaN.
    df["Age"] = pd.cut(
        np.trunc(age_values), [0, 8, 15, 30, 65, 150], include_lowest=True
    )

    # Bin fare.
    # NOTE(review): "Fare" is removed by the drop above, so the original
    # unconditional binning always raised KeyError.  The step is kept behind
    # a guard so it takes effect again if "Fare" leaves the drop list.
    if "Fare" in df.columns:
        df["Fare"] = pd.cut(df["Fare"], [0, 200, 400, 600, 1000])

    # NOTE(review): the original evaluated
    # ``pd.cut(df['SibSp'], [0, 1, 2, 7], right=False)`` and discarded the
    # result, so SibSp has always been passed through unbinned.  It is left
    # unbinned here; confirm whether binning was intended.

    # Cabin into categories based on first letter (deck of boat), with a
    # separate category "N" for all the NaNs.  Filling first keeps this
    # working when every cabin is missing and the column is not string-typed.
    df["Cabin"] = df["Cabin"].fillna("N").astype(str).str.slice(0, 1)

    # Fixes for hopsworks...
    df.columns = df.columns.str.lower()

    # Final encoding.  Column names are lower-case at this point, so the keys
    # of the encoding table are lower-cased too; otherwise its "Cabin" entry
    # would never match a column.
    encodings = {
        str(column).lower(): mapping
        for column, mapping in cleanup_catergories.items()
    }
    return df.replace(encodings)