{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Load the original student performance data\n",
    "data = pd.read_csv(\"StudentsPerformance.csv\")  # Replace \"StudentsPerformance.csv\" with your actual file path\n",
    "\n",
    "# Extract existing scores\n",
    "math_scores = data['math score']\n",
    "reading_scores = data['reading score']\n",
    "writing_scores = data['writing score']\n",
    "\n",
    "# Summarise each subject: the mean and standard deviation computed here\n",
    "# parameterise the normal distributions sampled below\n",
    "math_mean = math_scores.mean()\n",
    "math_std = math_scores.std()\n",
    "\n",
    "reading_mean = reading_scores.mean()\n",
    "reading_std = reading_scores.std()\n",
    "\n",
    "writing_mean = writing_scores.mean()\n",
    "writing_std = writing_scores.std()\n",
    "\n",
    "# Seeded generator so that Restart & Run All reproduces the same synthetic data;\n",
    "# change SEED to draw a different sample\n",
    "SEED = 42\n",
    "rng = np.random.default_rng(SEED)\n",
    "\n",
    "# Generate one synthetic score per student for each of the three subjects\n",
    "num_students = len(data)\n",
    "synthetic_math_scores = np.round(rng.normal(math_mean, math_std, num_students))\n",
    "synthetic_reading_scores = np.round(rng.normal(reading_mean, reading_std, num_students))\n",
    "synthetic_writing_scores = np.round(rng.normal(writing_mean, writing_std, num_students))\n",
    "\n",
    "# Clamp out-of-range draws to the valid 0-100 range (values are clipped, not rescaled)\n",
    "synthetic_math_scores = np.clip(synthetic_math_scores, 0, 100)\n",
    "synthetic_reading_scores = np.clip(synthetic_reading_scores, 0, 100)\n",
    "synthetic_writing_scores = np.clip(synthetic_writing_scores, 0, 100)\n",
    "\n",
    "# Calculate internal marks out of 30 for each subject\n",
    "internal_marks_scale = 30 / 100\n",
    "internal_math_marks = np.round(synthetic_math_scores * internal_marks_scale)\n",
    "internal_reading_marks = np.round(synthetic_reading_scores * internal_marks_scale)\n",
    "internal_writing_marks = np.round(synthetic_writing_scores * internal_marks_scale)\n",
    "\n",
    "# Build the output frame: demographic columns are copied from the loaded data,\n",
    "# score and internal-mark columns are the synthetic values generated above\n",
    "synthetic_data = pd.DataFrame({\n",
    "    'gender': data['gender'],\n",
    "    'race/ethnicity': data['race/ethnicity'],\n",
    "    'parental level of education': data['parental level of education'],\n",
    "    'lunch': data['lunch'],\n",
    "    'test preparation course': data['test preparation course'],\n",
    "    'math score': synthetic_math_scores.astype(int),\n",
    "    'reading score': synthetic_reading_scores.astype(int),\n",
    "    'writing score': synthetic_writing_scores.astype(int),\n",
    "    'internal math marks': internal_math_marks.astype(int),\n",
    "    'internal reading marks': internal_reading_marks.astype(int),\n",
    "    'internal writing marks': internal_writing_marks.astype(int)\n",
    "})\n",
    "\n",
    "# Save synthetic data to a new CSV file\n",
    "synthetic_data.to_csv(\"synthetic_data.csv\", index=False)  # Replace \"synthetic_data.csv\" with your desired file name\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Load the synthetic base dataset\n",
    "data = pd.read_csv(\"synthetic_data.csv\")  # Replace \"synthetic_data.csv\" with your actual file path\n",
    "\n",
    "# Extract existing scores\n",
    "math_scores = data['math score']\n",
    "reading_scores = data['reading score']\n",
    "writing_scores = data['writing score']\n",
    "\n",
    "# Summarise each existing subject: these means and standard deviations\n",
    "# parameterise the distributions used for the new subjects below\n",
    "math_mean = math_scores.mean()\n",
    "math_std = math_scores.std()\n",
    "\n",
    "reading_mean = reading_scores.mean()\n",
    "reading_std = reading_scores.std()\n",
    "\n",
    "writing_mean = writing_scores.mean()\n",
    "writing_std = writing_scores.std()\n",
    "\n",
    "# Seeded generator so that Restart & Run All reproduces the same output file.\n",
    "# The seed is specific to this cell: replaying the random stream that produced\n",
    "# the input scores would make the new columns near-copies of the existing ones.\n",
    "EXTRA_SUBJECTS_SEED = 2024\n",
    "extra_rng = np.random.default_rng(EXTRA_SUBJECTS_SEED)\n",
    "\n",
    "# Generate synthetic scores for additional subjects (Maths, Physics, Computer Science).\n",
    "# Each new subject borrows the distribution of an existing one:\n",
    "#   Maths <- math, Physics <- reading, Computer Science <- writing\n",
    "num_students = len(data)\n",
    "synthetic_maths_scores = np.round(extra_rng.normal(math_mean, math_std, num_students))\n",
    "synthetic_physics_scores = np.round(extra_rng.normal(reading_mean, reading_std, num_students))\n",
    "synthetic_cs_scores = np.round(extra_rng.normal(writing_mean, writing_std, num_students))\n",
    "\n",
    "# Clamp out-of-range draws to the valid 0-100 range (values are clipped, not rescaled)\n",
    "synthetic_maths_scores = np.clip(synthetic_maths_scores, 0, 100)\n",
    "synthetic_physics_scores = np.clip(synthetic_physics_scores, 0, 100)\n",
    "synthetic_cs_scores = np.clip(synthetic_cs_scores, 0, 100)\n",
    "\n",
    "# Calculate internal marks out of 30 for each subject\n",
    "internal_marks_scale = 30 / 100\n",
    "internal_maths_marks = np.round(synthetic_maths_scores * internal_marks_scale)\n",
    "internal_physics_marks = np.round(synthetic_physics_scores * internal_marks_scale)\n",
    "internal_cs_marks = np.round(synthetic_cs_scores * internal_marks_scale)\n",
    "\n",
    "# Build the output frame: the first eleven columns are carried over unchanged\n",
    "# from the loaded data, the remaining six are the new synthetic subjects.\n",
    "# Note that 'math score' (carried over) and 'maths score' (new) are distinct columns.\n",
    "synthetic_data = pd.DataFrame({\n",
    "    'gender': data['gender'],\n",
    "    'race/ethnicity': data['race/ethnicity'],\n",
    "    'parental level of education': data['parental level of education'],\n",
    "    'lunch': data['lunch'],\n",
    "    'test preparation course': data['test preparation course'],\n",
    "    'math score': data['math score'],\n",
    "    'reading score': data['reading score'],\n",
    "    'writing score': data['writing score'],\n",
    "    'internal math marks': data['internal math marks'],\n",
    "    'internal reading marks': data['internal reading marks'],\n",
    "    'internal writing marks': data['internal writing marks'],\n",
    "    'physics score': synthetic_physics_scores.astype(int),\n",
    "    'computer science score': synthetic_cs_scores.astype(int),\n",
    "    'internal physics marks': internal_physics_marks.astype(int),\n",
    "    'internal computer science marks': internal_cs_marks.astype(int),\n",
    "    'maths score': synthetic_maths_scores.astype(int),\n",
    "    'internal maths marks': internal_maths_marks.astype(int),\n",
    "})\n",
    "\n",
    "# Save synthetic data to a new CSV file\n",
    "synthetic_data.to_csv(\"synthetic_data_with_all_subjects.csv\", index=False)  # Replace with your desired file name\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/twisted-23.10.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330\u001b[0m\u001b[33m\n",
      "\u001b[0mCollecting faker\n",
      "  Downloading Faker-23.2.0-py3-none-any.whl.metadata (15 kB)\n",
      "Requirement already satisfied: python-dateutil>=2.4 in /opt/homebrew/lib/python3.11/site-packages (from faker) (2.8.2)\n",
      "Requirement already satisfied: six>=1.5 in /opt/homebrew/lib/python3.11/site-packages (from python-dateutil>=2.4->faker) (1.16.0)\n",
      "Downloading Faker-23.2.0-py3-none-any.whl (1.7 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25h\u001b[33mDEPRECATION: colab 1.13.5 has a non-standard dependency specifier pytz>=2011n. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of colab or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
      "\u001b[0mInstalling collected packages: faker\n",
      "Successfully installed faker-23.2.0\n"
     ]
    }
   ],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel;\n",
    "# the version is pinned to the release this notebook was last run with\n",
    "%pip install faker==23.2.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Realistic names column added and saved to ../student_analysis/synthetic_data_with_all_subjects.csv\n"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "from faker import Faker\n",
    "\n",
    "def generate_realistic_name():\n",
    "    \"\"\"Return one generated person name from Faker.\n",
    "\n",
    "    A single Faker instance is created on first use and cached on the\n",
    "    function object, so repeated calls (one per CSV row) do not rebuild\n",
    "    the generator each time.\n",
    "\n",
    "    Returns:\n",
    "        str: the value of ``Faker().name()``.\n",
    "    \"\"\"\n",
    "    fake = getattr(generate_realistic_name, \"_faker\", None)\n",
    "    if fake is None:\n",
    "        # First call: build the generator once and remember it\n",
    "        fake = generate_realistic_name._faker = Faker()\n",
    "    return fake.name()\n",
    "\n",
    "def add_synthetic_column(csv_file, column_name='name'):\n",
    "    \"\"\"Prepend a column of generated names to a CSV file, in place.\n",
    "\n",
    "    The first row of the file is treated as the header and receives\n",
    "    ``column_name`` as its new first field; every following row receives a\n",
    "    freshly generated name. Calling this twice on the same file adds a\n",
    "    second name column.\n",
    "\n",
    "    Args:\n",
    "        csv_file: Path of the CSV file to read and then overwrite.\n",
    "        column_name: Header label for the new first column.\n",
    "\n",
    "    Returns:\n",
    "        None. The file at ``csv_file`` is rewritten.\n",
    "    \"\"\"\n",
    "    with open(csv_file, 'r', newline='') as infile:\n",
    "        rows = list(csv.reader(infile))\n",
    "\n",
    "    # An empty file has no header to label and no rows to name\n",
    "    if not rows:\n",
    "        return\n",
    "\n",
    "    header, records = rows[0], rows[1:]\n",
    "\n",
    "    # Label the header row, and generate a name for each data row only\n",
    "    updated_rows = [[column_name] + header]\n",
    "    updated_rows.extend([generate_realistic_name()] + record for record in records)\n",
    "\n",
    "    # Write the updated data back to the same CSV file\n",
    "    with open(csv_file, 'w', newline='') as outfile:\n",
    "        csv.writer(outfile).writerows(updated_rows)\n",
    "\n",
    "# Add the generated-name column to the combined-subjects dataset and report the path.\n",
    "# NOTE(review): this path goes through ../student_analysis/, whereas the cell that\n",
    "# writes synthetic_data_with_all_subjects.csv uses a bare filename - confirm both\n",
    "# refer to the same file for the working directory this notebook runs in.\n",
    "csv_file_path = '../student_analysis/synthetic_data_with_all_subjects.csv'\n",
    "add_synthetic_column(csv_file_path)\n",
    "print(f\"Realistic names column added and saved to {csv_file_path}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}