Spaces:

antonbol
/

titanic

Runtime error

antonbol committed on Nov 15, 2022

Commit

331fa48

•

1 Parent(s): 7a25fef

removed title

Files changed (1) hide show

feature_engineering.py CHANGED Viewed

@@ -48,30 +48,30 @@ def feat_eng(df):
     # )
     # Drop features and NaNs
-    df.drop(["Ticket", "Cabin", "Fare", "PassengerId"], axis=1, inplace=True)
     df = df[df["Embarked"].notna()]
     # Feature engineering
     # Create a title feature
-    if "Name" in df.columns:
-        df["Title"] = df.Name.str.extract("([A-Za-z]+)\\.")
-        df.drop("Name", axis=1, inplace=True)
-    # Interpolate missing ages
-    for title in df["Title"].unique():
-        # This should be optimized
-        mask = (df["Title"] == title) & df["Age"].isna()
-        # Get suitable candidates for age sampling
-        candidates = df.loc[(df["Title"] == title) & df["Age"].notna()]
-        g = candidates.groupby("Age", dropna=True)["Age"].count()
-        g = g.apply(lambda x: x / g.sum())
-        weights = g.to_numpy()
-        ages = g.index
-        df.update(df["Age"][mask].apply(lambda x: np.random.choice(ages, p=weights)))
     # Cast age to int
     df["Age"] = df["Age"].astype("int")

     # )
     # Drop features and NaNs
+    df.drop(["Ticket", "Cabin", "Fare", "PassengerId", "Title"], axis=1, inplace=True)
     df = df[df["Embarked"].notna()]
     # Feature engineering
     # Create a title feature
+    # if "Name" in df.columns:
+    #     df["Title"] = df.Name.str.extract("([A-Za-z]+)\\.")
+    #     df.drop("Name", axis=1, inplace=True)
+    # # Interpolate missing ages
+    # for title in df["Title"].unique():
+    #     # This should be optimized
+    #     mask = (df["Title"] == title) & df["Age"].isna()
+    #     # Get suitable candidates for age sampling
+    #     candidates = df.loc[(df["Title"] == title) & df["Age"].notna()]
+    #     g = candidates.groupby("Age", dropna=True)["Age"].count()
+    #     g = g.apply(lambda x: x / g.sum())
+    #     weights = g.to_numpy()
+    #     ages = g.index
+    #     df.update(df["Age"][mask].apply(lambda x: np.random.choice(ages, p=weights)))
     # Cast age to int
     df["Age"] = df["Age"].astype("int")