Spaces:
Sleeping
Sleeping
"""
Preliminary preprocessing on the data, such as:
- correcting column names
- encoding the target column
"""
| import pandas as pd | |
| from sklearn import preprocessing | |
# Columns dropped from the raw data before any other processing
# (dropping is done with errors="ignore", so a missing column is fine).
COLUMNS_TO_DROP = ["Unnamed: 133"]

# Name of the label column. Kept as a one-element list because it is used
# directly for DataFrame column selection and for Index.drop().
TARGET_COLUMN = ["prognosis"]

# Raw column names (misspelled or containing stray spaces) mapped to the
# names used in the preprocessed files.
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}
def pretty_print(input):
    """
    Prettify the input.

    Underscores are replaced by spaces, every word is title-cased and the
    resulting names are sorted alphabetically.

    Args:
        input: Can be a list of symptoms (one level of nested lists is
            flattened) or a single disease given as a string. The name
            shadows the builtin, but it is part of the public signature
            and is therefore kept.

    Returns:
        list: Sorted and prettified input.
    """
    # A bare string is ONE name, not a sequence of characters: wrap it so
    # the loop below does not split it letter by letter.
    if isinstance(input, str):
        input = [input]

    # Flatten one level of nesting
    pretty_list = []
    for item in input:
        if isinstance(item, list):
            pretty_list.extend(item)
        else:
            pretty_list.append(item)

    # Sort and prettify the input
    return sorted(item.replace("_", " ").title() for item in pretty_list)
def map_prediction(target_columns=("y", "prognosis"), label=1):
    """
    Map an encoded label back to its disease name.

    Reads "Training_preprocessed.csv" from the current working directory,
    reduces it to the distinct (label, name) pairs and returns the name
    paired with ``label``.

    Args:
        target_columns: Two column names, the encoded-label column first
            and the disease-name column second.
        label: Encoded label to look up. Defaults to 1, the value that
            was previously hard-coded in the filter.

    Returns:
        The first non-missing value of the name column among the rows
        whose label column equals ``label``.

    Raises:
        ValueError: If no row carries the requested label.
    """
    # DataFrame selection needs a list; a tuple would be read as one key.
    columns = list(target_columns)
    label_column, name_column = columns[0], columns[1]

    df = pd.read_csv("Training_preprocessed.csv")
    pairs = df[columns].drop_duplicates()

    # Boolean-mask selection on the de-duplicated pairs; dropna() guards
    # against rows whose name is missing.
    names = pairs.loc[pairs[label_column] == label, name_column].dropna()
    if names.empty:
        raise ValueError(
            f"No row with {label_column} == {label!r} in Training_preprocessed.csv"
        )
    return names.iloc[0]
if __name__ == "__main__":
    # Load data
    df_train = pd.read_csv("Training.csv")
    df_test = pd.read_csv("Testing.csv")

    # Apply the same clean-up to both splits: remove useless columns, then
    # correct the typos in some column names. `columns=` already selects
    # the axis, so no separate `axis` argument is needed.
    for frame in (df_train, df_test):
        frame.drop(columns=COLUMNS_TO_DROP, errors="ignore", inplace=True)
        frame.rename(columns=RENAME_COLUMNS, inplace=True)

    # Encode the target labels as integers in a new "y" column. The encoder
    # is fitted on the training labels only and reused for the test split.
    label_encoder = preprocessing.LabelEncoder()
    train_labels = df_train[TARGET_COLUMN].values.flatten()
    test_labels = df_test[TARGET_COLUMN].values.flatten()
    df_train["y"] = label_encoder.fit_transform(train_labels)
    df_test["y"] = label_encoder.transform(test_labels)

    # Cast every column except the target name column to float32. The list
    # is built after "y" was added, so the encoded label is cast as well.
    float_columns = df_train.columns.drop(TARGET_COLUMN)
    df_train[float_columns] = df_train[float_columns].astype("float32")
    df_test[float_columns] = df_test[float_columns].astype("float32")

    # Save preprocessed data
    df_train.to_csv("Training_preprocessed.csv", index=False)
    df_test.to_csv("Testing_preprocessed.csv", index=False)