Spaces:
Sleeping
Sleeping
# scripts/generate_data.py | |
import numpy as np | |
import pandas as pd | |
import os | |
def generate_dataset(n_samples=1000): | |
np.random.seed(42) | |
study_hours = np.random.normal(10, 2, n_samples) | |
tuition_hours = np.random.normal(5, 1, n_samples) | |
parental_education = np.random.choice(['High', 'Medium', 'Low'], n_samples) | |
school_type = np.random.choice(['Public', 'Private'], n_samples) | |
exam_score = 50 + 2 * study_hours + 1.5 * tuition_hours + np.random.normal(0, 5, n_samples) | |
df = pd.DataFrame({ | |
'StudyHours': study_hours, | |
'TuitionHours': tuition_hours, | |
'ParentalEducation': parental_education, | |
'SchoolType': school_type, | |
'FinalExamScore': exam_score | |
}) | |
# Ensure data directory exists | |
os.makedirs('../data', exist_ok=True) | |
df.to_csv('../data/sample_dataset.csv', index=False) | |
return df | |
if __name__ == "__main__": | |
generate_dataset() | |
print("Dataset generated and saved to ../data/sample_dataset.csv") | |