Spaces:
Sleeping
Sleeping
singhjagpreet
committed on
Commit
•
bf0670d
1
Parent(s):
eb2eadc
data transformation
Browse files- logs/09_09_2023_20_29_47.log/09_09_2023_20_29_47.log +9 -0
- logs/09_09_2023_20_30_41.log/09_09_2023_20_30_41.log +10 -0
- logs/09_09_2023_20_31_40.log/09_09_2023_20_31_40.log +10 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/components/__pycache__/__init__.cpython-310.pyc +0 -0
- src/components/__pycache__/data_transformation.cpython-310.pyc +0 -0
- src/components/data_ingestion.py +7 -1
- src/components/data_transformation.py +111 -0
- src/utils.py +19 -0
logs/09_09_2023_20_29_47.log/09_09_2023_20_29_47.log
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[ 2023-09-09 20:29:51,791 ] 25 root - INFO - Entered the data ingestion method or component
|
2 |
+
[ 2023-09-09 20:29:51,795 ] 28 root - INFO - read the dataset as dataframe
|
3 |
+
[ 2023-09-09 20:29:51,799 ] 37 root - INFO - Train test split initiated
|
4 |
+
[ 2023-09-09 20:29:51,804 ] 44 root - INFO - ingestion of data completed
|
5 |
+
[ 2023-09-09 20:29:51,806 ] 68 root - INFO - read train and test data completed
|
6 |
+
[ 2023-09-09 20:29:51,806 ] 70 root - INFO - obtaining preprocessing object
|
7 |
+
[ 2023-09-09 20:29:51,806 ] 44 root - INFO - numerical columns: ['writing_score', 'reading_score']
|
8 |
+
[ 2023-09-09 20:29:51,806 ] 51 root - INFO - categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
|
9 |
+
[ 2023-09-09 20:29:51,806 ] 81 root - INFO - applying preprocessing object on training and testing dataframe
|
logs/09_09_2023_20_30_41.log/09_09_2023_20_30_41.log
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[ 2023-09-09 20:30:42,388 ] 25 root - INFO - Entered the data ingestion method or component
|
2 |
+
[ 2023-09-09 20:30:42,391 ] 28 root - INFO - read the dataset as dataframe
|
3 |
+
[ 2023-09-09 20:30:42,394 ] 37 root - INFO - Train test split initiated
|
4 |
+
[ 2023-09-09 20:30:42,398 ] 44 root - INFO - ingestion of data completed
|
5 |
+
[ 2023-09-09 20:30:42,400 ] 68 root - INFO - read train and test data completed
|
6 |
+
[ 2023-09-09 20:30:42,400 ] 70 root - INFO - obtaining preprocessing object
|
7 |
+
[ 2023-09-09 20:30:42,400 ] 44 root - INFO - numerical columns: ['writing_score', 'reading_score']
|
8 |
+
[ 2023-09-09 20:30:42,400 ] 51 root - INFO - categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
|
9 |
+
[ 2023-09-09 20:30:42,400 ] 81 root - INFO - applying preprocessing object on training and testing dataframe
|
10 |
+
[ 2023-09-09 20:30:42,408 ] 100 root - INFO - saved preprocessing object.
|
logs/09_09_2023_20_31_40.log/09_09_2023_20_31_40.log
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[ 2023-09-09 20:31:41,630 ] 25 root - INFO - Entered the data ingestion method or component
|
2 |
+
[ 2023-09-09 20:31:41,633 ] 28 root - INFO - read the dataset as dataframe
|
3 |
+
[ 2023-09-09 20:31:41,635 ] 37 root - INFO - Train test split initiated
|
4 |
+
[ 2023-09-09 20:31:41,639 ] 44 root - INFO - ingestion of data completed
|
5 |
+
[ 2023-09-09 20:31:41,641 ] 68 root - INFO - read train and test data completed
|
6 |
+
[ 2023-09-09 20:31:41,641 ] 70 root - INFO - obtaining preprocessing object
|
7 |
+
[ 2023-09-09 20:31:41,641 ] 44 root - INFO - numerical columns: ['writing_score', 'reading_score']
|
8 |
+
[ 2023-09-09 20:31:41,641 ] 51 root - INFO - categorical columns: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
|
9 |
+
[ 2023-09-09 20:31:41,641 ] 81 root - INFO - applying preprocessing object on training and testing dataframe
|
10 |
+
[ 2023-09-09 20:31:41,648 ] 100 root - INFO - saved preprocessing object.
|
src/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (651 Bytes). View file
|
|
src/components/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (158 Bytes). View file
|
|
src/components/__pycache__/data_transformation.cpython-310.pyc
ADDED
Binary file (3.17 kB). View file
|
|
src/components/data_ingestion.py
CHANGED
@@ -7,6 +7,9 @@ import pandas as pd
|
|
7 |
from sklearn.model_selection import train_test_split
|
8 |
from dataclasses import dataclass
|
9 |
|
|
|
|
|
|
|
10 |
@dataclass
|
11 |
class DataIngestionConfig:
|
12 |
train_data_path: str=os.path.join('artifacts','train.csv')
|
@@ -49,4 +52,7 @@ class DataIngestion:
|
|
49 |
|
50 |
if __name__ == '__main__':
|
51 |
obj=DataIngestion()
|
52 |
-
obj.intiate_data_ingestion()
|
|
|
|
|
|
|
|
7 |
from sklearn.model_selection import train_test_split
|
8 |
from dataclasses import dataclass
|
9 |
|
10 |
+
from src.components.data_transformation import DataTransformation,DataTransformationConfig
|
11 |
+
|
12 |
+
|
13 |
@dataclass
|
14 |
class DataIngestionConfig:
|
15 |
train_data_path: str=os.path.join('artifacts','train.csv')
|
|
|
52 |
|
53 |
if __name__ == '__main__':
|
54 |
obj=DataIngestion()
|
55 |
+
train_data_path, test_data_path = obj.intiate_data_ingestion()
|
56 |
+
|
57 |
+
data_transformation = DataTransformation()
|
58 |
+
data_transformation.initiate_data_tranformation(train_data_path,test_data_path)
|
src/components/data_transformation.py
CHANGED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
from dataclasses import dataclass
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
from sklearn.compose import ColumnTransformer
|
8 |
+
from sklearn.impute import SimpleImputer
|
9 |
+
from sklearn.preprocessing import OneHotEncoder,StandardScaler
|
10 |
+
from sklearn.pipeline import Pipeline
|
11 |
+
|
12 |
+
|
13 |
+
from src.exception import CustomException
|
14 |
+
from src.logger import logging
|
15 |
+
from src.utils import save_object
|
16 |
+
|
17 |
+
@dataclass
class DataTransformationConfig:
    # Filesystem location where the fitted preprocessing object is persisted.
    # Annotated as `str` so @dataclass treats it as a real field (the sibling
    # DataIngestionConfig annotates its paths the same way); without the
    # annotation it would be a plain class attribute, not a dataclass field.
    preprocessor_ob_file_path: str = os.path.join('artifacts', 'preprocessor.pkl')
21 |
+
class DataTransformation:
    """Builds the feature-preprocessing pipeline, applies it to the
    train/test splits and persists the fitted preprocessor to disk.
    """

    def __init__(self, numerical_columns=None, categorical_columns=None,
                 target_column_name='math_score'):
        """Configure the transformation step.

        Args:
            numerical_columns: columns to median-impute and standard-scale.
                Defaults to the student-performance dataset schema.
            categorical_columns: columns to most-frequent-impute and one-hot
                encode. Defaults to the student-performance dataset schema.
            target_column_name: label column excluded from the features.
        """
        self.data_transformation_config = DataTransformationConfig()
        # Defaults preserve the previously hard-coded dataset schema, so
        # existing no-argument callers behave identically.
        self.numerical_columns = (numerical_columns if numerical_columns is not None
                                  else ['writing_score', 'reading_score'])
        self.categorical_columns = (categorical_columns if categorical_columns is not None
                                    else ['gender',
                                          'race_ethnicity',
                                          'parental_level_of_education',
                                          'lunch',
                                          'test_preparation_course'])
        self.target_column_name = target_column_name

    def get_data_transformer_object(self):
        """Return an unfitted ColumnTransformer for the configured columns.

        Numerical columns are median-imputed then standard-scaled;
        categorical columns are most-frequent-imputed then one-hot encoded.
        (The previous docstring claimed this method "performs data
        transformation" — it only builds the transformer; fitting happens
        in initiate_data_tranformation.)

        Raises:
            CustomException: wraps any error raised while building the object.
        """
        try:
            num_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                # NOTE(review): step name 'scaller' is a typo, but step names
                # are part of the pipeline's public API (set_params, lookup),
                # so it is kept for backward compatibility.
                ('scaller', StandardScaler())
            ])
            logging.info(f"numerical columns: {self.numerical_columns}")

            cat_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                # NOTE(review): drop='first' combined with
                # handle_unknown='ignore' raises ValueError on
                # scikit-learn < 1.0 — confirm the pinned sklearn version.
                ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'))
            ])
            logging.info(f"categorical columns: {self.categorical_columns}")

            preprocessor = ColumnTransformer([
                ('num_pipeline', num_pipeline, self.numerical_columns),
                ('cat_pipeline', cat_pipeline, self.categorical_columns)
            ])

            return preprocessor
        except Exception as e:
            raise CustomException(e, sys)

    def initiate_data_tranformation(self, train_path, test_path):
        """Read the train/test CSVs, fit the preprocessor on the training
        features, transform both splits and persist the fitted preprocessor.

        (Method name retains its original spelling because external callers,
        e.g. data_ingestion's __main__ block, invoke it by this name.)

        Args:
            train_path: path to the training-split CSV.
            test_path: path to the test-split CSV.

        Returns:
            Tuple of (train_arr, test_arr, preprocessor_path): transformed
            feature arrays with the target appended as the last column, plus
            the path of the saved preprocessor object.

        Raises:
            CustomException: wraps any error during transformation.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info('read train and test data completed')

            logging.info('obtaining preprocessing object')

            preprocessing_obj = self.get_data_transformer_object()

            input_feature_train_df = train_df.drop(self.target_column_name, axis=1)
            target_feature_train_df = train_df[self.target_column_name]

            input_feature_test_df = test_df.drop(self.target_column_name, axis=1)
            target_feature_test_df = test_df[self.target_column_name]

            logging.info(f"applying preprocessing object on training and testing dataframe")

            # Fit only on the training split to avoid test-set leakage; the
            # test split is transformed with the already-fitted object.
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Re-attach the target as the final column of each array.
            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]

            test_arr = np.c_[
                input_feature_test_arr, np.array(target_feature_test_df)
            ]

            save_object(
                file_path=self.data_transformation_config.preprocessor_ob_file_path,
                obj=preprocessing_obj
            )
            logging.info(f"saved preprocessing object.")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_ob_file_path
            )

        except Exception as e:
            raise CustomException(e, sys)
src/utils.py
CHANGED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import pickle
|
7 |
+
|
8 |
+
|
9 |
+
from src.exception import CustomException
|
10 |
+
|
11 |
+
def save_object(file_path, obj):
|
12 |
+
try:
|
13 |
+
dir_path = os.path.dirname(file_path)
|
14 |
+
os.makedirs(dir_path,exist_ok=True)
|
15 |
+
with open(file_path, 'wb') as file_obj:
|
16 |
+
pickle.dump(obj,file_obj)
|
17 |
+
|
18 |
+
except Exception as e:
|
19 |
+
raise CustomException(e,sys)
|