Spaces:

toandaominh1997
/

labresearch

Runtime error

File size: 6,995 Bytes

4a89467

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06391735-12f3-45dd-988f-28559ca176f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "a0544a40-d98a-4b01-84dd-f74c8c373749",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(98000, 30)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv('./data.csv')\n",
    "# sub = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2022/sample_submission.csv')\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "ba3107bc-49c1-4aff-a86d-1b0da2e9f84a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn import cluster \n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "from yellowbrick.cluster import KElbowVisualizer\n",
    "from sklearn.preprocessing import QuantileTransformer\n",
    "from sklearn.metrics import silhouette_score, silhouette_samples\n",
    "\n",
    "\n",
    "X = data.drop(columns = ['id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24040c75-4a16-4d62-9b78-6a5af14fe560",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m[I 2022-07-03 16:47:32,023]\u001b[0m A new study created in memory with name: tonne\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "import optuna\n",
    "pipe = Pipeline(steps = [\n",
    "    ('imputer', SimpleImputer()),\n",
    "    ('preprocessing', QuantileTransformer()),\n",
    "    ('reduce_dimenstion', PCA(n_components = 10)),\n",
    "    ('estimator', cluster.KMeans(n_clusters = 6))\n",
    "])\n",
    "\n",
    "rds = {\n",
    "    'pca': PCA(),\n",
    "    'tsne': TSNE()\n",
    "}\n",
    "estimators = {\n",
    "    'kmeans': cluster.KMeans(),\n",
    "    'agg': cluster.AgglomerativeClustering()\n",
    "}\n",
    "def objective(trial):\n",
    "    params = {}\n",
    "    \n",
    "    rd_name = trial.suggest_categorical('rd', ['pca'])\n",
    "    params['reduce_dimenstion'] = rds[rd_name]\n",
    "    n_components = trial.suggest_int('n_components', 7, 20)\n",
    "    params['reduce_dimenstion__n_components'] = n_components\n",
    "    \n",
    "    estimator_name = trial.suggest_categorical('cluster', ['kmeans'])\n",
    "    params['estimator'] = estimators[estimator_name]\n",
    "    \n",
    "    if estimator_name in ['kmeans', 'agg']:\n",
    "        n_clusters = trial.suggest_int('n_clusters', 4, 12)\n",
    "        params['estimator__n_clusters'] = n_clusters\n",
    "    \n",
    "    pipe.set_params(**params)\n",
    "    pipe.fit(X)\n",
    "    y_pred = pipe.predict(X)\n",
    "    sh_score = silhouette_score(pipe[:-1].transform(X), y_pred)\n",
    "    return sh_score\n",
    "    \n",
    "study = optuna.create_study(study_name = 'tonne',\n",
    "                            direction='maximize')\n",
    "study.optimize(objective, \n",
    "               n_trials=10,\n",
    "               timeout = 5000,\n",
    "               # n_jobs = 2,\n",
    "              )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f556b049-5d9f-4c1b-9d7f-d0f176e9dece",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'rd': 'pca', 'n_components': 7, 'cluster': 'kmeans', 'n_clusters': 11}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_params = study.best_params\n",
    "best_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "8c1a1766-2cca-41e4-83a7-749a7d8a8d47",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Predicted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Id  Predicted\n",
       "0   0          6\n",
       "1   1          0\n",
       "2   2          2\n",
       "3   3          1\n",
       "4   4          5"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub = pd.DataFrame()\n",
    "\n",
    "sub['Id'] = data.id\n",
    "sub['Predicted'] = pipe.predict(X)\n",
    "sub.to_csv('submit.csv', index = False)\n",
    "sub.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e17420e6-29ea-4f56-b9cc-233c31f18066",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(42351, 30)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28370b1d-7eb9-4e9d-8b61-74b1910070ac",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}