# Import necessary modules
import os
import sys
import pandas as pd
from src.exception import CustomException
from src.logger import logging
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from src.components.data_transformation import DataTransformation, DataTransformationConfig
from src.components.model_trainer import ModelTrainerConfig, ModelTrainer


# Define a configuration class for data ingestion settings
@dataclass
class DataIngestionConfig:
    """File locations read and written by the data ingestion step."""

    # Where the training split is written.
    train_data_path: str = os.path.join('artifacts', "train.csv")
    # Where the test split is written.
    test_data_path: str = os.path.join('artifacts', "test.csv")
    # Where the unsplit copy of the source dataset is written.
    raw_data_path: str = os.path.join('artifacts', "data.csv")
    # CSV file the dataset is read from; override to ingest another file.
    source_data_path: str = 'EDA/data/data_modified.csv'


# Define the main class responsible for data ingestion
class DataIngestion:
    """Read the source CSV, store a raw copy and write a train/test split."""

    def __init__(self):
        self.ingestion_config = DataIngestionConfig()

    def _ensure_output_dirs(self):
        """Create the parent directory of every configured output file."""
        config = self.ingestion_config
        output_paths = (
            config.raw_data_path,
            config.train_data_path,
            config.test_data_path,
        )
        for path in output_paths:
            parent = os.path.dirname(path)
            # A bare filename has no parent directory, and os.makedirs('')
            # would raise, so only create directories that are named.
            if parent:
                os.makedirs(parent, exist_ok=True)

    # Function to initiate data ingestion
    def initiate_data_ingestion(self):
        """Ingest the dataset and write the raw, train and test CSV files.

        The source CSV is read into a dataframe, saved unchanged to
        ``raw_data_path``, then split 80/20 into training and test sets
        which are saved to ``train_data_path`` and ``test_data_path``.

        Returns:
            A ``(train_data_path, test_data_path)`` tuple.

        Raises:
            CustomException: wraps any exception raised while reading,
                splitting or writing the data.
        """
        logging.info("Entered the Data Ingestion Method and Components")
        try:
            config = self.ingestion_config

            # Read the dataset into a dataframe
            df = pd.read_csv(config.source_data_path)
            logging.info('Read the Dataset as Dataframe')

            # Every output file may live in a different directory, so make
            # sure all of them exist before writing anything.
            self._ensure_output_dirs()

            # Save the raw data to a CSV file
            df.to_csv(config.raw_data_path, index=False, header=True)

            logging.info("Train Test Split Initiated")
            # A fixed random_state keeps the split reproducible across runs.
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
            train_set.to_csv(config.train_data_path, index=False, header=True)
            test_set.to_csv(config.test_data_path, index=False, header=True)

            logging.info("Ingestion of the Data is Completed")

            # Return the paths of the training and testing data
            return (
                config.train_data_path,
                config.test_data_path,
            )
        except Exception as e:
            # Re-raise as the project's exception type, keeping the original
            # error attached as the explicit cause.
            raise CustomException(e, sys) from e


# Main execution block to run data ingestion
if __name__ == "__main__":
    # Step 1: run ingestion and collect the paths of the two split files.
    ingestion = DataIngestion()
    train_path, test_path = ingestion.initiate_data_ingestion()

    # Step 2: transform both splits. Three values come back; the third
    # (bound as a preprocessor file path in this script) is not used below.
    transformer = DataTransformation()
    transformed = transformer.initiate_data_transformation(train_path, test_path)
    train_array, test_array, _preprocessor_path = transformed

    # Step 3: train on the transformed arrays and print the trainer's result.
    trainer = ModelTrainer()
    training_result = trainer.initiate_model_trainer(train_array, test_array)
    print(training_result)