################ Dicts with encodings ################
# Disabled deck encoding for the "Cabin" column, kept for reference:
# cabin_dict = {"Cabin": {"N": 0, "C": 1, "E": 2, "G": 3, "D": 4,
#                         "A": 5, "B": 6, "F": 7, "T": 8}}

# Integer codes for the categorical columns that survive feature engineering.
sex_dict = {"female": 1, "male": 0}
embarked_dict = {"S": 0, "C": 1, "Q": 2}

# Nested ``{column: {old: new}}`` mapping handed to ``DataFrame.replace``.
# Keys are lower-case because the columns are lower-cased before encoding.
# Built from copies of the per-column dicts so the codes are defined once.
# NOTE: the misspelled name is kept so existing importers keep working.
cleanup_catergories = {
    "sex": dict(sex_dict),
    "embarked": dict(embarked_dict),
}

# Disabled title encoding (reversed: code -> titles), kept for reference.
# Code 4 groups the rare titles that are not worth individual categories.
# title_dict = {
#     0: ["Mr"],
#     1: ["Miss"],
#     2: ["Mrs"],
#     3: ["Master"],
#     4: ["Dr", "Rev", "Mlle", "Major", "Col", "Countess", "Capt", "Ms",
#         "Sir", "Lady", "Nme", "Don", "Jonkheer"],  # "Nme": sic, likely "Mme"
# }
#####################################################


def feat_eng(df):
    """Run the feature-engineering step of the pipeline on a passenger frame.

    Steps, in order:

    1. Drop the ``Ticket``, ``Cabin``, ``Fare``, ``PassengerId`` and ``Name``
       columns.
    2. Drop rows whose ``Embarked`` or ``Age`` is missing.
    3. Cast ``Age`` to integer (fractional ages are truncated).
    4. Lower-case all column names (required by Hopsworks according to the
       original author's note).
    5. Encode ``sex`` and ``embarked`` with ``cleanup_catergories``.

    Args:
        df: pandas DataFrame containing at least the columns ``Ticket``,
            ``Cabin``, ``Fare``, ``PassengerId``, ``Name``, ``Embarked`` and
            ``Age``. It is not modified.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: if any of the columns listed above is absent.
    """
    # Previously prototyped here and currently disabled (see version history):
    #   * extract a "Title" feature from "Name" with the regex "([A-Za-z]+)\."
    #   * fill missing ages by sampling from the age distribution of
    #     passengers sharing the same title
    #   * bin Age [0, 8, 15, 30, 65, 150], Fare [0, 200, 400, 600, 1000]
    #     and SibSp [0, 1, 2, 7]
    #   * reduce "Cabin" to its deck letter, with "N" for missing values

    # Non-inplace drop: the caller's frame must stay untouched.
    df = df.drop(columns=["Ticket", "Cabin", "Fare", "PassengerId", "Name"])

    # Age imputation is disabled, so rows without an age cannot be cast to
    # int below; drop them together with the rows lacking an embarkation port.
    df = df.dropna(subset=["Embarked", "Age"])

    # astype returns a fresh frame, which avoids chained-assignment warnings
    # from writing into a filtered slice.
    df = df.astype({"Age": "int"})

    # Fixes for hopsworks...
    df.columns = df.columns.str.lower()

    # Final encoding. infer_objects() turns the encoded object columns into
    # numeric ones without relying on replace()'s deprecated downcasting.
    return df.replace(cleanup_catergories).infer_objects()