{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Synthetic student performance data\n",
    "\n",
    "This notebook builds a synthetic student dataset in three steps:\n",
    "\n",
    "1. Fit a normal distribution (mean, sample standard deviation) to each of the math, reading and writing scores in `StudentsPerformance.csv`, draw new scores from it, and derive internal marks out of 30. Saved as `synthetic_data.csv`.\n",
    "2. Generate maths, physics and computer science scores the same way, sampling from the math, reading and writing distributions of `synthetic_data.csv` respectively. Saved as `synthetic_data_with_all_subjects.csv`.\n",
    "3. Prepend a column of fake student names (generated with Faker) to `../student_analysis/synthetic_data_with_all_subjects.csv`.\n",
    "\n",
    "Every random step is seeded from `SEED` in the configuration cell, so **Restart Kernel → Run All** regenerates the same files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Faker supplies the fake student names. %pip installs into the running kernel's\n",
    "# environment; the version is pinned so the generated names stay reproducible.\n",
    "%pip install -q faker==23.2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from faker import Faker\n",
    "\n",
    "# Seed shared by every random step so the generated files are reproducible.\n",
    "SEED = 42\n",
    "\n",
    "# Input / output files.\n",
    "SOURCE_CSV = \"StudentsPerformance.csv\"\n",
    "BASE_SYNTHETIC_CSV = \"synthetic_data.csv\"\n",
    "ALL_SUBJECTS_CSV = \"synthetic_data_with_all_subjects.csv\"\n",
    "# NOTE(review): presumably the same file as ALL_SUBJECTS_CSV when the notebook is\n",
    "# run from inside student_analysis/ - confirm before changing either path.\n",
    "NAMED_CSV = \"../student_analysis/synthetic_data_with_all_subjects.csv\"\n",
    "\n",
    "SCORE_MAX = 100          # scores are clipped to 0..SCORE_MAX\n",
    "INTERNAL_MARKS_MAX = 30  # internal marks are scores rescaled to 0..INTERNAL_MARKS_MAX\n",
    "\n",
    "# Columns copied unchanged from the source data.\n",
    "DEMOGRAPHIC_COLUMNS = [\n",
    "    \"gender\",\n",
    "    \"race/ethnicity\",\n",
    "    \"parental level of education\",\n",
    "    \"lunch\",\n",
    "    \"test preparation course\",\n",
    "]\n",
    "BASE_SUBJECTS = (\"math\", \"reading\", \"writing\")\n",
    "# New subject -> existing subject whose score distribution it is sampled from.\n",
    "EXTRA_SUBJECTS = {\"maths\": \"math\", \"physics\": \"reading\", \"computer science\": \"writing\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def synthesize_scores(reference: pd.Series, rng: np.random.Generator, max_score: int = SCORE_MAX) -> np.ndarray:\n",
    "    \"\"\"Draw one synthetic integer score per entry of `reference`.\n",
    "\n",
    "    Samples come from a normal distribution with the mean and sample standard\n",
    "    deviation of `reference`; they are rounded to whole numbers and clipped to\n",
    "    the range 0..max_score.\n",
    "    \"\"\"\n",
    "    samples = rng.normal(reference.mean(), reference.std(), len(reference))\n",
    "    return np.clip(np.round(samples), 0, max_score).astype(int)\n",
    "\n",
    "\n",
    "def to_internal_marks(scores: np.ndarray, max_marks: int = INTERNAL_MARKS_MAX, max_score: int = SCORE_MAX) -> np.ndarray:\n",
    "    \"\"\"Rescale scores out of `max_score` to whole internal marks out of `max_marks`.\"\"\"\n",
    "    return np.round(scores * (max_marks / max_score)).astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - resample the math, reading and writing scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "students_raw = pd.read_csv(SOURCE_CSV)\n",
    "\n",
    "# A generator created here (not shared across cells) keeps this cell reproducible\n",
    "# even when it is re-run on its own.\n",
    "rng_base = np.random.default_rng(SEED)\n",
    "base_scores = {\n",
    "    subject: synthesize_scores(students_raw[f\"{subject} score\"], rng_base)\n",
    "    for subject in BASE_SUBJECTS\n",
    "}\n",
    "\n",
    "# Column order: demographics, the three scores, then the three internal marks.\n",
    "students_synthetic = students_raw[DEMOGRAPHIC_COLUMNS].assign(\n",
    "    **{f\"{subject} score\": scores for subject, scores in base_scores.items()},\n",
    "    **{f\"internal {subject} marks\": to_internal_marks(scores) for subject, scores in base_scores.items()}\n",
    ")\n",
    "students_synthetic.to_csv(BASE_SYNTHETIC_CSV, index=False)\n",
    "students_synthetic.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - add maths, physics and computer science"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "students_base = pd.read_csv(BASE_SYNTHETIC_CSV)\n",
    "\n",
    "# Different seed from step 1 so the new subjects are not a replay of its draws.\n",
    "rng_extra = np.random.default_rng(SEED + 1)\n",
    "extra_scores = {\n",
    "    new_subject: synthesize_scores(students_base[f\"{template} score\"], rng_extra)\n",
    "    for new_subject, template in EXTRA_SUBJECTS.items()\n",
    "}\n",
    "extra_marks = {subject: to_internal_marks(scores) for subject, scores in extra_scores.items()}\n",
    "\n",
    "base_columns = (\n",
    "    DEMOGRAPHIC_COLUMNS\n",
    "    + [f\"{subject} score\" for subject in BASE_SUBJECTS]\n",
    "    + [f\"internal {subject} marks\" for subject in BASE_SUBJECTS]\n",
    ")\n",
    "\n",
    "# The new columns are listed explicitly to keep the established output column order.\n",
    "students_all_subjects = students_base[base_columns].assign(**{\n",
    "    \"physics score\": extra_scores[\"physics\"],\n",
    "    \"computer science score\": extra_scores[\"computer science\"],\n",
    "    \"internal physics marks\": extra_marks[\"physics\"],\n",
    "    \"internal computer science marks\": extra_marks[\"computer science\"],\n",
    "    \"maths score\": extra_scores[\"maths\"],\n",
    "    \"internal maths marks\": extra_marks[\"maths\"],\n",
    "})\n",
    "students_all_subjects.to_csv(ALL_SUBJECTS_CSV, index=False)\n",
    "students_all_subjects.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3 - add fake student names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One shared generator, so a new Faker instance is not constructed for every name.\n",
    "_fake = Faker()\n",
    "\n",
    "\n",
    "def generate_realistic_name():\n",
    "    \"\"\"Return one fake full name from the shared Faker instance.\"\"\"\n",
    "    return _fake.name()\n",
    "\n",
    "\n",
    "def add_synthetic_column(csv_file, column_name=\"name\"):\n",
    "    \"\"\"Prepend a column of fake names to `csv_file`, rewriting the file in place.\n",
    "\n",
    "    The first row is treated as the header and is given `column_name`; every\n",
    "    other row gets a freshly generated name. If the file already starts with a\n",
    "    `column_name` column (for example because this ran before), that column is\n",
    "    replaced instead of a second one being stacked in front of it. An empty\n",
    "    file is left untouched.\n",
    "    \"\"\"\n",
    "    with open(csv_file, \"r\", newline=\"\") as infile:\n",
    "        rows = list(csv.reader(infile))\n",
    "\n",
    "    if not rows:\n",
    "        return\n",
    "\n",
    "    header, records = rows[0], rows[1:]\n",
    "    if header[:1] == [column_name]:\n",
    "        # Drop the names column left by an earlier run before adding a new one.\n",
    "        header = header[1:]\n",
    "        records = [record[1:] for record in records]\n",
    "\n",
    "    # The header row gets the column label, never a generated name.\n",
    "    labelled = [[column_name] + header]\n",
    "    labelled += [[generate_realistic_name()] + record for record in records]\n",
    "\n",
    "    with open(csv_file, \"w\", newline=\"\") as outfile:\n",
    "        csv.writer(outfile).writerows(labelled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reseed here so re-running this cell regenerates the same names.\n",
    "Faker.seed(SEED)\n",
    "add_synthetic_column(NAMED_CSV)\n",
    "print(f\"Realistic names column added and saved to {NAMED_CSV}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}