Spaces:

SURESHBEEKHANI
/

StudentExamPerformancePrediction

Sleeping

App Files Community

StudentExamPerformancePrediction / src / components / data_ingestion.py

SURESHBEEKHANI

Upload 15 files

72475ed verified 9 months ago

raw

history blame contribute delete

3.74 kB

	# Import necessary libraries and modules
	import os # For file and directory operations
	import sys # For system-specific parameters and functions
	from ..exception import CustomException

	from src.logger import logging # Adjusted to absolute import

	import pandas as pd # For data manipulation and analysis

	from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
	from dataclasses import dataclass # For creating data classes

	# Import components for data transformation and model training
	from src.components.data_transformation import DataTransformation # Data transformation class
	from src.components.data_transformation import DataTransformationConfig # Configuration for data transformation
	from src.components.model_trainer import ModelTrainerConfig # Configuration for model training
	from src.components.model_trainer import ModelTrainer # Model training class

	# Define a data class for Data Ingestion Configuration
	@dataclass
	class DataIngestionConfig:
	    """Default output locations for the files written during data ingestion.

	    All three defaults are relative paths under the ``artifacts`` directory.
	    """

	    # Where the training split is written
	    train_data_path: str = os.path.join("artifacts", "train.csv")
	    # Where the test split is written
	    test_data_path: str = os.path.join("artifacts", "test.csv")
	    # Where the full dataset, as read from disk, is written
	    raw_data_path: str = os.path.join("artifacts", "data.csv")

	# Data Ingestion Class Definition
	class DataIngestion:
	    """Read the source CSV, save a raw copy, and write train/test splits.

	    Output locations are taken from a ``DataIngestionConfig`` instance stored
	    on ``self.ingestion_config``.
	    """

	    def __init__(self):
	        # Holds the train / test / raw output paths used by initiate_data_ingestion
	        self.ingestion_config = DataIngestionConfig()

	    @staticmethod
	    def _ensure_parent_dirs(paths):
	        """Create the parent directory of every path in *paths* if it is missing."""
	        for path in paths:
	            parent = os.path.dirname(path)
	            # A bare filename has an empty dirname, and os.makedirs("") raises,
	            # so only create directories that are actually named.
	            if parent:
	                os.makedirs(parent, exist_ok=True)

	    def initiate_data_ingestion(
	        self,
	        source_path: str = "notebook/data/stud.csv",
	        *,
	        test_size: float = 0.2,
	        random_state: int = 42,
	    ):
	        """Load the dataset, persist it, split it, and persist both splits.

	        Args:
	            source_path: CSV file to read. Defaults to the path the project
	                originally hard-coded.
	            test_size: Forwarded to ``train_test_split`` as ``test_size``.
	            random_state: Forwarded to ``train_test_split`` as ``random_state``.

	        Returns:
	            A ``(train_data_path, test_data_path)`` tuple taken from the
	            ingestion config.

	        Raises:
	            CustomException: Wraps any exception raised while reading,
	                splitting, or writing the data.
	        """
	        logging.info("Entered the data ingestion method or component")
	        try:
	            df = pd.read_csv(source_path)
	            logging.info('Read the dataset as dataframe')

	            config = self.ingestion_config

	            # All three outputs may live in different directories when the
	            # config is customised, so prepare every parent, not just train's.
	            self._ensure_parent_dirs(
	                (config.raw_data_path, config.train_data_path, config.test_data_path)
	            )

	            # Keep an unmodified copy of the dataset alongside the splits
	            df.to_csv(config.raw_data_path, index=False, header=True)

	            logging.info("Train test split initiated")
	            train_set, test_set = train_test_split(
	                df, test_size=test_size, random_state=random_state
	            )

	            train_set.to_csv(config.train_data_path, index=False, header=True)
	            test_set.to_csv(config.test_data_path, index=False, header=True)

	            logging.info("Ingestion of the data is completed")

	            return (
	                config.train_data_path,
	                config.test_data_path,
	            )
	        except Exception as e:
	            # Chain explicitly so the original error is kept as __cause__
	            raise CustomException(e, sys) from e

	# Main block to execute the data ingestion process
	if __name__ == "__main__":
	    # Stage 1: ingestion -- writes the CSV files and yields the split paths
	    ingestion = DataIngestion()
	    train_path, test_path = ingestion.initiate_data_ingestion()

	    # Stage 2: transformation -- returns three values; the third is not used here
	    transformer = DataTransformation()
	    train_arr, test_arr, _ = transformer.initiate_data_transformation(train_path, test_path)

	    # Stage 3: model training -- print whatever the trainer returns
	    trainer = ModelTrainer()
	    training_result = trainer.initiate_model_trainer(train_arr, test_arr)
	    print(training_result)