In [1]:
import os

In [2]:
%pwd

'c:\\Users\\nikhil\\OneDrive\\Desktop\\ML Projects\\ipp\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\nikhil\\OneDrive\\Desktop\\ML Projects\\ipp'

In [28]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir : Path
    train_data_path : Path
    test_data_path : Path
    preprocessor_obj_file_path : Path
    prepared_train_data : Path
    prepared_test_data : Path

In [29]:
from insurancePP.constants import *
from insurancePP.utils.common import read_yaml, create_directories, save_object

In [30]:
class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])
        

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir = config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            preprocessor_obj_file_path = config.preprocessor_obj_file_path,
            prepared_train_data = config.prepared_train_data,
            prepared_test_data = config.prepared_test_data
        )

        return data_transformation_config

In [31]:
import os
import urllib.request as request
import zipfile
from insurancePP.logging import logger
from insurancePP.utils.common import get_size
from datasets import load_from_disk
from insurancePP.logging import logger

import numpy as np
from numpy import save
import pandas as pd
import sys

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [36]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config


    def get_data_transformer_object(self):
        try:
            categorical_columns = ['sex', 'smoker', 'region']
            numerical_columns = ['age', 'bmi', 'children']
            logger.info("Numerical and Categorical features has been extracted from dataset")

            logger.info("creating categorical pipeline")
            categorical_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
                ('onehot', OneHotEncoder()),
                ('scaler', StandardScaler(with_mean=False))
            ])

            logger.info("creating numerical pipelines")
            numerical_pipeline = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ])

            logger.info("creating column transformer object")
            preprocessor = ColumnTransformer([
                ('numerical_pipeline', numerical_pipeline, numerical_columns),
                ('categorical_pipeline', categorical_pipeline, categorical_columns)
            ])

            return preprocessor
        
        except Exception as e:
            raise e


    def initiate_data_transformation(self):
        try:
            train_df = pd.read_csv(self.config.train_data_path)
            test_df = pd.read_csv(self.config.test_data_path)

            logger.info('obtaining preprocessing object')
            preprocessor_obj = self.get_data_transformer_object()

            target_column_name = 'expenses'
            input_feature_train_df = train_df.drop(columns=[target_column_name],axis=1)
            target_feature_train_df = train_df[target_column_name]


            input_feature_test_df = test_df.drop(columns=[target_column_name],axis=1)
            target_feature_test_df = test_df[target_column_name]

            logger.info('Applying preprocessing object on training and testing dataframe')

            input_feature_train_arr = preprocessor_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessor_obj.transform(input_feature_test_df)

            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save(self.config.prepared_train_data, train_arr)
            save(self.config.prepared_test_data, test_arr)

            save_object(
                file_path = Path(self.config.preprocessor_obj_file_path),
                obj = preprocessor_obj
            )


        except Exception as e:
            raise e

    

In [37]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config = data_transformation_config)
    data_transformation.get_data_transformer_object()
    data_transformation.initiate_data_transformation()
    
except Exception as e:
    raise e

[2024-02-25 18:53:02,160 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-02-25 18:53:02,164 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-02-25 18:53:02,168 : INFO : common : directory artifacts created]
[2024-02-25 18:53:02,170 : INFO : common : directory artifacts/data_transformation created]
[2024-02-25 18:53:02,172 : INFO : 1099139855 : Numerical and Categorical features has been extracted from dataset]
[2024-02-25 18:53:02,174 : INFO : 1099139855 : creating categorical pipeline]
[2024-02-25 18:53:02,175 : INFO : 1099139855 : creating numerical pipelines]
[2024-02-25 18:53:02,177 : INFO : 1099139855 : creating column transformer object]
[2024-02-25 18:53:02,189 : INFO : 1099139855 : obtaining preprocessing object]
[2024-02-25 18:53:02,190 : INFO : 1099139855 : Numerical and Categorical features has been extracted from dataset]
[2024-02-25 18:53:02,193 : INFO : 1099139855 : creating categorical pipeline]
[2024-02-25 18:53:02,195 : INF