File size: 4,894 Bytes
4ec7aed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Import necessary modules
import sys
import os

import numpy as np 
import pandas as pd

from dataclasses import dataclass

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object

# Define a configuration class for data transformation settings
@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation step."""

    # Where the fitted preprocessor is persisted. The type annotation is what
    # makes this a real dataclass field (part of __init__/repr/eq); without it
    # the name would only be a plain class attribute.
    preprocessor_obj_file_path: str = os.path.join('artifacts', "preprocessor.pkl")

# Define the main class responsible for data transformation
class DataTransformation:
    """Build, fit and persist the feature preprocessor for the train/test CSVs.

    The target column is ``Income``; the feature columns are the categorical
    and numerical columns listed in ``get_data_transformer_object``.
    """

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    @staticmethod
    def _to_dense(matrix):
        """Return *matrix* as a dense array, converting sparse input if needed.

        ``ColumnTransformer`` may return a SciPy sparse matrix when one-hot
        encoded columns dominate the output; ``np.c_`` cannot stack such a
        matrix with a dense target vector.
        """
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix

    # Function to create a preprocessing object
    def get_data_transformer_object(self):
        """Create the (unfitted) preprocessing ``ColumnTransformer``.

        Numerical columns are standardised with ``StandardScaler``.
        Categorical columns are one-hot encoded and then scaled without
        centering (``with_mean=False``), which keeps the encoder's sparse
        output valid input for the scaler.

        Expected input columns:

            numerical:   Work_Experience, Number_of_Dependents,
                         Household_Size, Age, Cluster
            categorical: Primary_Mode_of_Transportation, Education_Level,
                         Occupation, Marital_Status, Living_Standards, Gender,
                         Homeownership_Status, Location, Type_of_Housing,
                         Employment_Status, Age_Group

        Returns:
            The configured, unfitted ``ColumnTransformer``.

        Raises:
            CustomException: wraps any error raised while building it.
        """
        try:
            # Define the categorical and numerical columns
            categorical_columns = [
                "Primary_Mode_of_Transportation", "Education_Level", "Occupation",
                "Marital_Status", "Living_Standards", "Gender", "Homeownership_Status",
                "Location", "Type_of_Housing", "Employment_Status", "Age_Group",
            ]

            numerical_columns = [
                "Work_Experience", "Number_of_Dependents", "Household_Size", "Age", "Cluster",
            ]

            num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
            cat_pipeline = Pipeline(
                steps=[
                    # handle_unknown="ignore": a category that appears only in
                    # the test split (or at inference time) is encoded as all
                    # zeros instead of making transform() raise.
                    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                    ("scaler", StandardScaler(with_mean=False)),
                ]
            )

            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipelines", cat_pipeline, categorical_columns),
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys) from e

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the train CSV and transform both splits.

        Args:
            train_path: Path of the training CSV; must contain ``Income``.
            test_path: Path of the test CSV; must contain ``Income``.

        Returns:
            A tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed features with the ``Income`` target appended
            as the last column; ``preprocessor_path`` is the file the fitted
            preprocessor was saved to.

        Raises:
            CustomException: wraps any error raised while reading,
                transforming or saving.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Read train and test data completed")
            logging.info("Obtaining preprocessing object")

            preprocessing_obj = self.get_data_transformer_object()

            target_column_name = "Income"

            # The ColumnTransformer selects the feature columns by name, so
            # only the target has to be removed here.
            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training dataframe and testing dataframe.")

            # Fit on train only; the test split is transformed with the
            # statistics/categories learned from train.
            input_feature_train_arr = self._to_dense(
                preprocessing_obj.fit_transform(input_feature_train_df)
            )
            input_feature_test_arr = self._to_dense(
                preprocessing_obj.transform(input_feature_test_df)
            )

            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj,
            )

            # Logged only after the save has actually succeeded.
            logging.info("Saved preprocessing object.")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )

        except Exception as e:
            raise CustomException(e, sys) from e