Spaces:
Sleeping
Sleeping
File size: 2,803 Bytes
4ec7aed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# Import necessary modules
import os
import sys
import pandas as pd
from src.exception import CustomException
from src.logger import logging
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from src.components.data_transformation import DataTransformation, DataTransformationConfig
from src.components.model_trainer import ModelTrainerConfig, ModelTrainer
# Define a configuration class for data ingestion settings
@dataclass
class DataIngestionConfig:
    """File locations written by the data-ingestion step.

    Every default points inside the ``artifacts`` directory.
    """

    # Destination of the training split.
    train_data_path: str = os.path.join('artifacts', "train.csv")
    # Destination of the testing split.
    test_data_path: str = os.path.join('artifacts', "test.csv")
    # Destination of the unsplit copy of the source dataset.
    raw_data_path: str = os.path.join('artifacts', "data.csv")
# Define the main class responsible for data ingestion
class DataIngestion:
    """Read the source dataset and write raw, train and test CSV files.

    Output locations are taken from ``self.ingestion_config``.
    """

    def __init__(self, config=None):
        """Store the ingestion settings.

        Args:
            config: Optional object exposing ``train_data_path``,
                ``test_data_path`` and ``raw_data_path``. When omitted, a
                default ``DataIngestionConfig`` is created.
        """
        self.ingestion_config = DataIngestionConfig() if config is None else config

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain *path*, if it names one."""
        parent = os.path.dirname(path)
        # os.makedirs('') raises, so skip bare file names with no directory part.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def initiate_data_ingestion(self, source_path: str = 'EDA/data/data_modified.csv'):
        """Load the dataset, persist it, and write an 80/20 train/test split.

        Args:
            source_path: CSV file to read. Defaults to the dataset path this
                component has always used.

        Returns:
            A ``(train_data_path, test_data_path)`` tuple of the files written.

        Raises:
            CustomException: Raised with the original exception and ``sys``
                whenever any step fails.
        """
        logging.info("Entered the Data Ingestion Method and Components")
        try:
            # Read the dataset into a dataframe
            df = pd.read_csv(source_path)
            logging.info('Read the Dataset as Dataframe')

            config = self.ingestion_config
            # Every output may live in a different directory, so make sure
            # each one exists before anything is written.
            for out_path in (
                config.raw_data_path,
                config.train_data_path,
                config.test_data_path,
            ):
                self._ensure_parent_dir(out_path)

            # Save the raw data to a CSV file
            df.to_csv(config.raw_data_path, index=False, header=True)

            logging.info("Train Test Split Initiated")
            # Fixed random_state keeps the split reproducible between runs.
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
            train_set.to_csv(config.train_data_path, index=False, header=True)
            test_set.to_csv(config.test_data_path, index=False, header=True)
            logging.info("Ingestion of the Data is Completed")

            # Return the paths of the training and testing data
            return (
                config.train_data_path,
                config.test_data_path,
            )
        except Exception as e:
            # Re-raise as the project's exception type, keeping the cause.
            raise CustomException(e, sys) from e
# Main execution block to run data ingestion
if __name__ == "__main__":
    # Stage 1: ingest the dataset and get the train/test CSV paths.
    obj = DataIngestion()
    train_data, test_data = obj.initiate_data_ingestion()

    # Stage 2: transform both splits; the third returned value (the
    # preprocessor file path) is not needed by this script.
    data_transformation = DataTransformation()
    train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data, test_data)

    # Stage 3: train the model and print whatever the trainer returns.
    modeltrainer = ModelTrainer()
    print(modeltrainer.initiate_model_trainer(train_arr, test_arr))
|