SURESHBEEKHANI's picture
Upload 15 files
72475ed verified
# Import necessary libraries and modules
import os # For file and directory operations
import sys # For system-specific parameters and functions
from ..exception import CustomException
from src.logger import logging # Adjusted to absolute import
import pandas as pd # For data manipulation and analysis
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from dataclasses import dataclass # For creating data classes
#Import components for data transformation and model training
from src.components.data_transformation import DataTransformation # Data transformation class
from src.components.data_transformation import DataTransformationConfig # Configuration for data transformation
from src.components.model_trainer import ModelTrainerConfig # Configuration for model training
from src.components.model_trainer import ModelTrainer # Model training class
# Define a data class for Data Ingestion Configuration
@dataclass
class DataIngestionConfig:
# Specify file paths for training, testing, and raw data
train_data_path: str = os.path.join('artifacts', "train.csv")
test_data_path: str = os.path.join('artifacts', "test.csv")
raw_data_path: str = os.path.join('artifacts', "data.csv")
# Data Ingestion Class Definition
class DataIngestion:
def __init__(self):
# Initialize ingestion configuration
self.ingestion_config = DataIngestionConfig()
# Method to initiate data ingestion
def initiate_data_ingestion(self):
logging.info("Entered the data ingestion method or component")
try:
# Read the dataset from the specified path
df = pd.read_csv('notebook/data/stud.csv') # Update path based on your directory structure
logging.info('Read the dataset as dataframe')
# Create necessary directories if they do not exist
os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)
# Save the raw data to CSV
df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)
logging.info("Train test split initiated")
# Split the data into training and testing sets
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
# Save the training and testing sets to CSV
train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)
logging.info("Ingestion of the data is completed")
# Return the paths of the train and test data
return (
self.ingestion_config.train_data_path,
self.ingestion_config.test_data_path
)
except Exception as e:
raise CustomException(e, sys) # Raise custom exception on error
# Main block to execute the data ingestion process
if __name__ == "__main__":
# Create an instance of DataIngestion
obj = DataIngestion()
# Initiate data ingestion and get train and test data paths
train_data, test_data = obj.initiate_data_ingestion()
# Create an instance of DataTransformation
data_transformation = DataTransformation()
# Perform data transformation and obtain training and testing arrays
train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data, test_data)
# Create an instance of ModelTrainer
modeltrainer = ModelTrainer()
# Train the model and print the result
print(modeltrainer.initiate_model_trainer(train_arr, test_arr))