# CausalBox / scripts/generate_data.py
# ShutterStack's picture
# major changes
# ab66d4e verified
# scripts/generate_data.py
import numpy as np
import pandas as pd
import os
def generate_dataset(n_samples=1000, *, seed=42,
                     output_path='../data/sample_dataset.csv'):
    """Generate a synthetic student-performance dataset and optionally save it.

    Each row holds two numeric features, two categorical features and an
    exam score computed as::

        FinalExamScore = 50 + 2 * StudyHours + 1.5 * TuitionHours + noise

    where ``StudyHours ~ N(10, 2)``, ``TuitionHours ~ N(5, 1)`` and
    ``noise ~ N(0, 5)``. ``ParentalEducation`` and ``SchoolType`` are drawn
    uniformly and independently; they do not enter the score formula.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the random stream. The default (42) reproduces the
            values this function has always produced. ``None`` lets NumPy
            pick a fresh, non-reproducible seed.
        output_path: CSV file to write (without the index column). Missing
            parent directories are created. A relative path is resolved
            against the current working directory. Pass ``None`` to skip
            writing and only return the DataFrame.

    Returns:
        pandas.DataFrame with columns ``StudyHours``, ``TuitionHours``,
        ``ParentalEducation``, ``SchoolType`` and ``FinalExamScore``.
    """
    # A private RandomState yields exactly the stream that
    # ``np.random.seed(seed)`` plus the module-level ``np.random.*`` calls
    # would, but leaves NumPy's process-wide generator untouched.
    rng = np.random.RandomState(seed)

    # The draw order below is significant: changing it changes the data
    # produced for a given seed.
    study_hours = rng.normal(10, 2, n_samples)
    tuition_hours = rng.normal(5, 1, n_samples)
    parental_education = rng.choice(['High', 'Medium', 'Low'], n_samples)
    school_type = rng.choice(['Public', 'Private'], n_samples)
    noise = rng.normal(0, 5, n_samples)
    exam_score = 50 + 2 * study_hours + 1.5 * tuition_hours + noise

    df = pd.DataFrame({
        'StudyHours': study_hours,
        'TuitionHours': tuition_hours,
        'ParentalEducation': parental_education,
        'SchoolType': school_type,
        'FinalExamScore': exam_score,
    })

    if output_path is not None:
        # Derive the directory from the target file so the two cannot drift
        # apart; a bare filename has no directory component to create.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(output_path, index=False)

    return df
if __name__ == "__main__":
    # Script entry point: build the dataset with the function's default
    # arguments, then tell the user where the CSV was written.
    _dataset = generate_dataset()
    _done_message = "Dataset generated and saved to ../data/sample_dataset.csv"
    print(_done_message)