{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from sklearn.model_selection import train_test_split\n", "import os\n", "\n", "DATA_DIR = os.path.join(\"..\", \"data\")" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/v8/0hd98b512cn3ms2rz146k7jw0000gn/T/ipykernel_41770/685274063.py:1: DtypeWarning: Columns (481,482,483) have mixed types. Specify dtype option on import or set low_memory=False.\n", " detailed_games_df = pd.read_csv(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 377608 entries, 0 to 377607\n", "Columns: 487 entries, Unnamed: 0 to ChalkSeed\n", "dtypes: float64(347), int64(133), object(7)\n", "memory usage: 1.4+ GB\n" ] } ], "source": [ "detailed_games_df = pd.read_csv(\n", "    os.path.join(DATA_DIR, \"AllSuperDetailedGames.csv\"),\n", ")\n", "\n", "detailed_games_df.info()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Split Men's & Women's Data" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "mens_games_df = detailed_games_df[detailed_games_df[\"League\"] == \"M\"]\n", "wmns_games_df = detailed_games_df[detailed_games_df[\"League\"] == \"W\"]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Define Features, Targets, and Register Data on the Device" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# define the features and target for our models\n", "feature_cols = [\n", "    \"ScoreDiff mean reg\",\n", "    \"FGMDiff mean reg\",\n", "    \"FGM3Diff mean reg\",\n", "    \"TODiff mean reg\",\n", "\n", "    \"OppScore mean reg\",\n", "    \"OppFGM mean reg\",\n", "    \"OppFGM3 mean reg\",\n", "    \"OppTO mean reg\",\n", "]\n", "\n", "target_cols = [\"Win\"]" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# split into training and testing datasets\n", "MX_train, MX_test, My_train, My_test = train_test_split(\n", "    mens_games_df[feature_cols],\n", "    mens_games_df[target_cols],\n", "    test_size=0.2,\n", "    random_state=1,\n", ")\n", "\n", "# same split for the women's data\n", "WX_train, WX_test, Wy_train, Wy_test = train_test_split(\n", "    wmns_games_df[feature_cols],\n", "    wmns_games_df[target_cols],\n", "    test_size=0.2,\n", "    random_state=1,\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pick the best available device: CUDA GPU, Apple Silicon (MPS), or CPU\n", "def get_device() -> str:\n", "    if torch.cuda.is_available():\n", "        return \"cuda\"\n", "    if torch.backends.mps.is_available():\n", "        return \"mps\"\n", "    return \"cpu\"\n", "\n", "DEVICE = get_device()\n", "print(DEVICE)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# convert the data to tensor objects and register them on the device\n", "MX_train_T = torch.tensor(\n", "    MX_train.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "MX_test_T = torch.tensor(\n", "    MX_test.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "My_train_T = torch.tensor(\n", "    My_train.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "My_test_T = torch.tensor(\n", "    My_test.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "# same conversion for the women's data\n", "WX_train_T = torch.tensor(\n", "    WX_train.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "WX_test_T = torch.tensor(\n", "    WX_test.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "Wy_train_T = torch.tensor(\n", "    Wy_train.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)\n", "\n", "Wy_test_T = torch.tensor(\n", "    Wy_test.astype(float).values,\n", "    dtype=torch.float32,\n", ").to(DEVICE)" ] }
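, { "cell_type": "markdown", "metadata": {}, "source": [ "The eight conversions above all follow the same astype / tensor / to(DEVICE) pattern, so they could be wrapped in a small helper. The cell below is only a sketch of that optional refactor; `df_to_tensor` is a hypothetical name, and the explicit conversions above are what the rest of the notebook uses." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional refactor sketch: wrap the repeated astype/tensor/to(DEVICE) steps\n", "# df_to_tensor is a hypothetical helper and is not used elsewhere in this notebook\n", "def df_to_tensor(df: pd.DataFrame) -> torch.Tensor:\n", "    return torch.tensor(\n", "        df.astype(float).values,\n", "        dtype=torch.float32,\n", "    ).to(DEVICE)\n", "\n", "# example usage, equivalent to the explicit conversions above:\n", "# MX_train_T = df_to_tensor(MX_train)" ] }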
, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generic Neural Network Framework\n\nI am using the same neural network architecture for both the men's and women's data." ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "num_features = len(feature_cols)\n", "\n", "class NiglNN(nn.Module):\n", "    def __init__(self):\n", "        super().__init__()\n", "        self.activation_func = nn.Sigmoid()\n", "        self.layer1 = nn.Linear(num_features, 64)\n", "        self.layer2 = nn.Linear(64, 32)\n", "        self.layer3 = nn.Linear(32, 16)\n", "        self.layer4 = nn.Linear(16, 8)\n", "        self.layer5 = nn.Linear(8, 4)\n", "        self.layer6 = nn.Linear(4, 1)\n", "\n", "    def forward(self, x: torch.Tensor):\n", "        x = self.layer1(x)\n", "        x = self.activation_func(x)\n", "        x = self.layer2(x)\n", "        x = self.activation_func(x)\n", "        x = self.layer3(x)\n", "        x = self.activation_func(x)\n", "        x = self.layer4(x)\n", "        x = self.activation_func(x)\n", "        x = self.layer5(x)\n", "        x = self.activation_func(x)\n", "        # return raw logits: BCEWithLogitsLoss applies the sigmoid internally,\n", "        # so applying it here as well would squash the outputs twice\n", "        x = self.layer6(x)\n", "        return x\n" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1000 / 10000] Binary Cross Entropy: 0.6770758628845215\n", "[2000 / 10000] Binary Cross Entropy: 0.6671037077903748\n", "[3000 / 10000] Binary Cross Entropy: 0.6648934483528137\n", "[4000 / 10000] Binary Cross Entropy: 0.6640341281890869\n", "[5000 / 10000] Binary Cross Entropy: 0.663619875907898\n", "[6000 / 10000] Binary Cross Entropy: 0.6633755564689636\n", "[7000 / 10000] Binary Cross Entropy: 0.6631807088851929\n", "[8000 / 10000] Binary Cross Entropy: 0.663043200969696\n", "[9000 / 10000] Binary Cross Entropy: 0.6629269123077393\n", "[10000 / 10000] Binary Cross Entropy: 0.6629060506820679\n" ] } ], "source": [ "# men's training loop\n", "torch.manual_seed(2)\n", "\n", "epochs = 10_000\n", "nigl10k = NiglNN().to(DEVICE)\n", "loss_fn = nn.BCEWithLogitsLoss()\n", "optimizer = optim.Adam(\n", "    nigl10k.parameters(),\n", "    lr=0.001,\n", ")\n", "\n", "for epoch in range(1, epochs + 1):\n", "    optimizer.zero_grad()\n", "    pred = nigl10k(MX_train_T)\n", "    loss = loss_fn(pred, My_train_T)\n", "    loss.backward()\n", "    optimizer.step()\n", "\n", "    if epoch % 1_000 == 0:\n", "        print(f\"[{epoch} / {epochs}] Binary Cross Entropy: {loss.item()}\")\n" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Binary Cross Entropy: 0.6655928492546082\n" ] } ], "source": [ "# evaluate on the held-out men's test set\n", "nigl10k.eval()\n", "\n", "with torch.no_grad():\n", "    pred = nigl10k(MX_test_T)\n", "    loss = loss_fn(pred, My_test_T)\n", "    print(f\"Binary Cross Entropy: {loss.item()}\")" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# save the full model object (loading it later requires the NiglNN class definition)\n", "MODEL_DIR = os.path.join(\"..\", \"models\")\n", "\n", "torch.save(\n", "    nigl10k,\n", "    os.path.join(MODEL_DIR, \"nn10k.pth\"),\n", ")" ] }
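, { "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check (only a sketch, not part of the pipeline above), test-set accuracy can be estimated by passing the model's logits through `torch.sigmoid` and thresholding at 0.5. The cell below reuses the trained `nigl10k` model and the men's test tensors defined earlier." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch of an accuracy check on the men's test set (assumes the cells above have run)\n", "with torch.no_grad():\n", "    test_logits = nigl10k(MX_test_T)\n", "    # the network returns raw logits, so apply the sigmoid here to get win probabilities\n", "    test_probs = torch.sigmoid(test_logits)\n", "    test_preds = (test_probs >= 0.5).float()\n", "    accuracy = (test_preds == My_test_T).float().mean().item()\n", "\n", "print(f\"Test accuracy: {accuracy:.4f}\")" ] }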
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }