File size: 2,434 Bytes
13fb76e
 
 
 
 
 
 
 
 
58df7f1
 
 
 
 
 
 
 
13fb76e
 
 
 
 
 
 
58df7f1
 
 
 
 
13fb76e
 
 
 
58df7f1
 
13fb76e
 
58df7f1
 
13fb76e
 
 
 
 
58df7f1
 
 
 
 
 
 
13fb76e
58df7f1
13fb76e
58df7f1
 
 
 
 
 
13fb76e
 
58df7f1
13fb76e
 
 
 
58df7f1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
Preliminary preprocessing on the data, such as:
- correcting column names
- encoding the target column
"""

import pandas as pd
from sklearn import preprocessing

# Files location
TRAINING_FILE_NAME = "./data/Training.csv"
TESTING_FILE_NAME = "./data/Testing.csv"
TRAINING_OUTPUT_FILE_NAME = "./data/Training_preprocessed.csv"
TESTING_OUTPUT_FILE_NAME = "./data/Testing_preprocessed.csv"

# Columns processing
TARGET_COLUMN = "prognosis"
# Columns removed when present; a frame that lacks them is not an error.
DROP_COLUMNS = ["Unnamed: 133"]

# Original column name -> corrected column name.
# NOTE(review): "scurving" looks like it may itself be a typo (perhaps
# "scarring" was intended); kept unchanged because consumers of the
# preprocessed files may already rely on this name -- confirm.
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}

# Raw target label -> corrected label (applied before title-casing).
RENAME_VALUES = {
    "(vertigo) Paroymsal  Positional Vertigo": "Paroymsal Positional Vertigo",
    "Dimorphic hemmorhoids(piles)": "Dimorphic hemmorhoids (piles)",
    "Peptic ulcer diseae": "Peptic Ulcer",
}


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is left unmodified.

    Steps, in order:
    - drop the columns listed in ``DROP_COLUMNS`` (missing ones are ignored),
    - rename columns according to ``RENAME_COLUMNS``,
    - correct the ``TARGET_COLUMN`` labels listed in ``RENAME_VALUES``,
    - title-case every ``TARGET_COLUMN`` label.

    Raises:
        KeyError: if *df* has no ``TARGET_COLUMN`` column.
    """
    cleaned = df.drop(columns=DROP_COLUMNS, errors="ignore").rename(
        columns=RENAME_COLUMNS
    )

    # Assign the result back instead of calling
    # `cleaned[TARGET_COLUMN].replace(..., inplace=True)`: an in-place update
    # on a selected column is a chained assignment, which does not propagate
    # to the parent frame when pandas Copy-on-Write is active.
    cleaned[TARGET_COLUMN] = (
        cleaned[TARGET_COLUMN].replace(RENAME_VALUES).apply(str.title)
    )
    return cleaned


def main() -> None:
    """Preprocess the training and testing CSV files and save the results."""
    # Load data, remove useless columns and correct typos
    df_train = clean_dataframe(pd.read_csv(TRAINING_FILE_NAME))
    df_test = clean_dataframe(pd.read_csv(TESTING_FILE_NAME))

    # Convert the `TARGET_COLUMN` to a numeric label. The encoder is fitted on
    # the training labels only and then reused for the testing labels.
    encoded_column = f"{TARGET_COLUMN}_encoded"
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(df_train[TARGET_COLUMN].to_numpy())
    df_train[encoded_column] = label_encoder.transform(
        df_train[TARGET_COLUMN].to_numpy()
    )
    df_test[encoded_column] = label_encoder.transform(
        df_test[TARGET_COLUMN].to_numpy()
    )

    # Cast every column except `TARGET_COLUMN` to float32.
    # NOTE(review): this list also contains the `*_encoded` label column added
    # just above, so the numeric label is written out as a float as well.
    float_columns = df_train.columns.drop([TARGET_COLUMN])
    df_train[float_columns] = df_train[float_columns].astype("float32")
    df_test[float_columns] = df_test[float_columns].astype("float32")

    # Save preprocessed data
    df_train.to_csv(path_or_buf=TRAINING_OUTPUT_FILE_NAME, index=False)
    df_test.to_csv(path_or_buf=TESTING_OUTPUT_FILE_NAME, index=False)


if __name__ == "__main__":
    main()