antonbol committed on
Commit
331fa48
1 Parent(s): 7a25fef

removed title

Browse files
Files changed (1) hide show
  1. feature_engineering.py +15 -15
feature_engineering.py CHANGED
@@ -48,30 +48,30 @@ def feat_eng(df):
48
  # )
49
 
50
  # Drop features and NaNs
51
- df.drop(["Ticket", "Cabin", "Fare", "PassengerId"], axis=1, inplace=True)
52
  df = df[df["Embarked"].notna()]
53
 
54
  # Feature engineering
55
  # Create a title feature
56
- if "Name" in df.columns:
57
- df["Title"] = df.Name.str.extract("([A-Za-z]+)\\.")
58
- df.drop("Name", axis=1, inplace=True)
59
 
60
- # Interpolate missing ages
61
- for title in df["Title"].unique():
62
- # This should be optimized
63
- mask = (df["Title"] == title) & df["Age"].isna()
64
 
65
- # Get suitable candidates for age sampling
66
- candidates = df.loc[(df["Title"] == title) & df["Age"].notna()]
67
 
68
- g = candidates.groupby("Age", dropna=True)["Age"].count()
69
- g = g.apply(lambda x: x / g.sum())
70
 
71
- weights = g.to_numpy()
72
- ages = g.index
73
 
74
- df.update(df["Age"][mask].apply(lambda x: np.random.choice(ages, p=weights)))
75
 
76
  # Cast age to int
77
  df["Age"] = df["Age"].astype("int")
 
48
  # )
49
 
50
  # Drop features and NaNs
51
+ df.drop(["Ticket", "Cabin", "Fare", "PassengerId", "Title"], axis=1, inplace=True)
52
  df = df[df["Embarked"].notna()]
53
 
54
  # Feature engineering
55
  # Create a title feature
56
+ # if "Name" in df.columns:
57
+ # df["Title"] = df.Name.str.extract("([A-Za-z]+)\\.")
58
+ # df.drop("Name", axis=1, inplace=True)
59
 
60
+ # # Interpolate missing ages
61
+ # for title in df["Title"].unique():
62
+ # # This should be optimized
63
+ # mask = (df["Title"] == title) & df["Age"].isna()
64
 
65
+ # # Get suitable candidates for age sampling
66
+ # candidates = df.loc[(df["Title"] == title) & df["Age"].notna()]
67
 
68
+ # g = candidates.groupby("Age", dropna=True)["Age"].count()
69
+ # g = g.apply(lambda x: x / g.sum())
70
 
71
+ # weights = g.to_numpy()
72
+ # ages = g.index
73
 
74
+ # df.update(df["Age"][mask].apply(lambda x: np.random.choice(ages, p=weights)))
75
 
76
  # Cast age to int
77
  df["Age"] = df["Age"].astype("int")