kcelia committed on
Commit
b9461e1
1 Parent(s): 9904b2a

chore: remove preprocessing file

Browse files
Files changed (1) hide show
  1. preprocessing.py +0 -69
preprocessing.py DELETED
@@ -1,69 +0,0 @@
1
- """
2
- Preliminary preprocessing on the data, such as:
3
- - correcting column names
4
- - encoding the target column
5
- """
6
-
7
- import pandas as pd
8
- from sklearn import preprocessing
9
-
10
- # Files location
11
- TRAINING_FILE_NAME = "./data/Training.csv"
12
- TESTING_FILE_NAME = "./data/Testing.csv"
13
-
14
- # Columns processing
15
- TARGET_COLUMN = "prognosis"
16
- DROP_COLUMNS = ["Unnamed: 133"]
17
-
18
- RENAME_COLUMNS = {
19
- "scurring": "scurving",
20
- "dischromic _patches": "dischromic_patches",
21
- "spotting_ urination": "spotting_urination",
22
- "foul_smell_of urine": "foul_smell_of_urine",
23
- }
24
-
25
- RENAME_VALUES = {
26
- "(vertigo) Paroymsal Positional Vertigo": "Paroymsal Positional Vertigo",
27
- "Dimorphic hemmorhoids(piles)": "Dimorphic hemmorhoids (piles)",
28
- "Peptic ulcer diseae": "Peptic Ulcer",
29
- }
30
-
31
- if __name__ == "__main__":
32
-
33
- # Load data
34
- df_train = pd.read_csv(TRAINING_FILE_NAME)
35
- df_test = pd.read_csv(TESTING_FILE_NAME)
36
-
37
- # Remove useless columns
38
- df_train.drop(columns=DROP_COLUMNS, axis=1, errors="ignore", inplace=True)
39
- df_test.drop(columns=DROP_COLUMNS, axis=1, errors="ignore", inplace=True)
40
-
41
- # Correct typos in some column names
42
- df_train.rename(columns=RENAME_COLUMNS, inplace=True)
43
- df_test.rename(columns=RENAME_COLUMNS, inplace=True)
44
-
45
- df_train[TARGET_COLUMN].replace(RENAME_VALUES.keys(), RENAME_VALUES.values(), inplace=True)
46
- df_train[TARGET_COLUMN] = df_train[TARGET_COLUMN].apply(str.title)
47
-
48
- df_test[TARGET_COLUMN].replace(RENAME_VALUES.keys(), RENAME_VALUES.values(), inplace=True)
49
- df_test[TARGET_COLUMN] = df_test[TARGET_COLUMN].apply(str.title)
50
-
51
- # Convert the `TARGET_COLUMN` to a numeric label
52
- label_encoder = preprocessing.LabelEncoder()
53
- label_encoder.fit(df_train[[TARGET_COLUMN]].values.flatten())
54
-
55
- df_train[f"{TARGET_COLUMN}_encoded"] = label_encoder.transform(
56
- df_train[[TARGET_COLUMN]].values.flatten()
57
- )
58
- df_test[f"{TARGET_COLUMN}_encoded"] = label_encoder.transform(
59
- df_test[[TARGET_COLUMN]].values.flatten()
60
- )
61
-
62
- # Cast X features and the encoded target (every column except `TARGET_COLUMN`) from int64 to float32
63
- float_columns = df_train.columns.drop([TARGET_COLUMN])
64
- df_train[float_columns] = df_train[float_columns].astype("float32")
65
- df_test[float_columns] = df_test[float_columns].astype("float32")
66
-
67
- # Save preprocessed data
68
- df_train.to_csv(path_or_buf="./data/Training_preprocessed.csv", index=False)
69
- df_test.to_csv(path_or_buf="./data/Testing_preprocessed.csv", index=False)