File size: 2,654 Bytes
33e9cd7
 
7a25fef
 
33e9cd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a45d93
 
 
33e9cd7
 
20f693c
33e9cd7
 
 
 
331fa48
 
 
33e9cd7
331fa48
 
 
 
33e9cd7
331fa48
 
33e9cd7
331fa48
 
33e9cd7
331fa48
 
33e9cd7
331fa48
33e9cd7
 
 
 
ab3b679
33e9cd7
ab3b679
 
33e9cd7
 
ab3b679
 
33e9cd7
 
7a25fef
33e9cd7
 
7a25fef
33e9cd7
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

################ Dicts with encodings ################
# Integer codes for the categorical columns, one mapping per column.
sex_dict = dict(female=1, male=0)
embarked_dict = dict(S=0, C=1, Q=2)

# Nested {column: {value: code}} table combining the two mappings above.
# Built from copies so the three dicts remain independent objects.
cleanup_catergories = {
    "sex": dict(sex_dict),
    "embarked": dict(embarked_dict),
}

# Disabled cabin encoding, kept for reference:
# cabin_dict= "Cabin": {"N": 0, "C": 1, "E": 2, "G": 3, "D":4, "A": 5, "B": 6, "F": 7, "T": 8}

# Reversed
# NOTE: the title mapping below sits inside a bare string literal, so it is
# never executed and no `title_dict` name exists at runtime.
"""
title_dict = {
    0: ["Mr"],
    1: ["Miss"],
    2: ["Mrs"],
    3: ["Master"],
    # Rare titles, not worth individual categorys
    4: [
        "Dr",
        "Rev",
        "Mlle",
        "Major",
        "Col",
        "Countess",
        "Capt",
        "Ms",
        "Sir",
        "Lady",
        "Nme",
        "Don",
        "Jonkheer",
    ],
}
"""
#####################################################

def feat_eng(df, encodings=None):
    """
    Run the feature engineering part of the pipeline.

    The input frame is not modified; a new frame is returned, so the
    function can safely be called more than once on the same data.

    Steps, in order:
      1. Drop the unused columns Ticket, Cabin, Fare, PassengerId and Name
         (columns that are already absent are skipped).
      2. Drop rows whose Embarked or Age value is missing.
      3. Cast Age to int (the fractional part is truncated).
      4. Lower-case every column name.
      5. Replace categorical values by integer codes using `encodings`.

    Args:
        df: pandas DataFrame containing at least the columns "Embarked"
            and "Age".
        encodings: optional nested mapping ``{column: {value: code}}``,
            keyed by the lower-cased column names, handed to
            ``DataFrame.replace``. When omitted, the module-level
            ``cleanup_catergories`` table is used.

    Returns:
        A new DataFrame with lower-cased column names, integer ages and
        encoded categorical columns. Row labels of the kept rows are
        preserved.

    Raises:
        KeyError: if the "Embarked" or "Age" column is missing.
    """
    if encodings is None:
        encodings = cleanup_catergories

    # Non in-place drop: the caller's frame keeps all of its columns.
    unused_columns = ["Ticket", "Cabin", "Fare", "PassengerId", "Name"]
    features = df.drop(columns=unused_columns, errors="ignore")

    # The integer cast below cannot represent NaN, and no age imputation is
    # active in this pipeline, so rows without an age are removed together
    # with rows without an embarkation port.
    features = features.dropna(subset=["Embarked", "Age"])

    # assign() builds a new frame, which avoids pandas' chained-assignment
    # warning that writing into a filtered frame would trigger.
    features = features.assign(Age=features["Age"].astype("int"))

    # Fixes for hopsworks...
    features.columns = features.columns.str.lower()

    # Final encoding
    return features.replace(encodings)