{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "\n",
    "def normalize_run_name(run_name):\n",
    "    return run_name.replace(\"/\", \"_\")\n",
    "\n",
    "def save_for_plot(dir_name, df, run_names, xlabel=\"Dataset\", ylabel=\"Matched as dups probability\", plot_name=\"plot name\", custom_layout={}, ranges={}, x_column=None, default_metric=None):\n",
    "    import os\n",
    "    files = {}\n",
    "    os.makedirs(f\"data/plots/{dir_name}\", exist_ok=True)\n",
    "    data = {}\n",
    "    for run_name in run_names:\n",
    "        data[run_name] = {\n",
    "            \"x\": df[x_column].tolist() if x_column else [run_name],\n",
    "            \"y\": df[run_name].tolist(),\n",
    "            \"label\": run_name,\n",
    "        }\n",
    "    file_name = f\"default.json\"\n",
    "    files[\"default\"] = {\"file\": f\"{file_name}\"}\n",
    "    with open(f\"data/plots/{dir_name}/{file_name}\", \"w\") as f:\n",
    "        json.dump({\n",
    "            \"data\": data,\n",
    "            \"layout\": {\n",
    "                \"title\": {\n",
    "                    \"text\": plot_name,\n",
    "                },\n",
    "                \"xaxis\": {\n",
    "                    \"title\": {\n",
    "                        \"text\": xlabel,\n",
    "                    },\n",
    "                },\n",
    "                \"yaxis\": {\n",
    "                    # \"range\": ranges.get(view, None),\n",
    "                    \"title\": {\n",
    "                        \"text\": ylabel,\n",
    "                    },\n",
    "                },\n",
    "                **custom_layout,\n",
    "            }\n",
    "        }, f)\n",
    "    with open(f\"data/plots/{dir_name}/index.json\", \"w\") as f:\n",
    "        json.dump({\n",
    "            \"files\": files,\n",
    "            \"settings\": {\n",
    "                \"defaultMetric\": default_metric,\n",
    "                \"slider\": None,\n",
    "                \"autoSetXRange\": False,\n",
    "                \"type\": \"bar\"\n",
    "            }\n",
    "        }, f)\n",
    "    return files\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4-8</th>\n",
       "      <th>8-16</th>\n",
       "      <th>16-32</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1B</th>\n",
       "      <td>0.994974</td>\n",
       "      <td>0.005008</td>\n",
       "      <td>0.000018</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10B</th>\n",
       "      <td>0.951508</td>\n",
       "      <td>0.047331</td>\n",
       "      <td>0.001144</td>\n",
       "      <td>0.000017</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100B</th>\n",
       "      <td>0.608873</td>\n",
       "      <td>0.302822</td>\n",
       "      <td>0.074548</td>\n",
       "      <td>0.013757</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350B</th>\n",
       "      <td>0.174147</td>\n",
       "      <td>0.30712</td>\n",
       "      <td>0.268018</td>\n",
       "      <td>0.250649</td>\n",
       "      <td>0.000065</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1T</th>\n",
       "      <td>0.006232</td>\n",
       "      <td>0.03247</td>\n",
       "      <td>0.083743</td>\n",
       "      <td>0.817636</td>\n",
       "      <td>0.05991</td>\n",
       "      <td>0.000008</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             1         2         3       4-8      8-16     16-32\n",
       "1B    0.994974  0.005008  0.000018       0.0       0.0       0.0\n",
       "10B   0.951508  0.047331  0.001144  0.000017       0.0       0.0\n",
       "100B  0.608873  0.302822  0.074548  0.013757       0.0       0.0\n",
       "350B  0.174147   0.30712  0.268018  0.250649  0.000065       0.0\n",
       "1T    0.006232   0.03247  0.083743  0.817636   0.05991  0.000008"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4-8</th>\n",
       "      <th>8-16</th>\n",
       "      <th>16-32</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1B</td>\n",
       "      <td>0.994974</td>\n",
       "      <td>0.005008</td>\n",
       "      <td>0.000018</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10B</td>\n",
       "      <td>0.951508</td>\n",
       "      <td>0.047331</td>\n",
       "      <td>0.001144</td>\n",
       "      <td>0.000017</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>100B</td>\n",
       "      <td>0.608873</td>\n",
       "      <td>0.302822</td>\n",
       "      <td>0.074548</td>\n",
       "      <td>0.013757</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>350B</td>\n",
       "      <td>0.174147</td>\n",
       "      <td>0.30712</td>\n",
       "      <td>0.268018</td>\n",
       "      <td>0.250649</td>\n",
       "      <td>0.000065</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1T</td>\n",
       "      <td>0.006232</td>\n",
       "      <td>0.03247</td>\n",
       "      <td>0.083743</td>\n",
       "      <td>0.817636</td>\n",
       "      <td>0.05991</td>\n",
       "      <td>0.000008</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  index         1         2         3       4-8      8-16     16-32\n",
       "0    1B  0.994974  0.005008  0.000018       0.0       0.0       0.0\n",
       "1   10B  0.951508  0.047331  0.001144  0.000017       0.0       0.0\n",
       "2  100B  0.608873  0.302822  0.074548  0.013757       0.0       0.0\n",
       "3  350B  0.174147   0.30712  0.268018  0.250649  0.000065       0.0\n",
       "4    1T  0.006232   0.03247  0.083743  0.817636   0.05991  0.000008"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summarized_df.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'default': {'file': 'default.json'}}"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n",
    "\n",
    "\n",
    "def summarize_ranges(df):\n",
    "    df_summarized = pd.DataFrame(\n",
    "        index=[\"1\", \"2\", \"3\", \"4-8\", \"8-16\", \"16-32\"], columns=df.columns\n",
    "    )\n",
    "    df_summarized.loc[\"1\"] = df.iloc[0]\n",
    "    df_summarized.loc[\"2\"] = df.iloc[1]\n",
    "    df_summarized.loc[\"3\"] = df.iloc[2]\n",
    "    df_summarized.loc[\"4-8\"] = df.iloc[3:9].sum()\n",
    "    df_summarized.loc[\"8-16\"] = df.iloc[9:17].sum()\n",
    "    df_summarized.loc[\"16-32\"] = df.iloc[17:].sum()\n",
    "    return df_summarized\n",
    "\n",
    "\n",
    "summarized_df = summarize_ranges(df).T\n",
    "cols = summarized_df.columns\n",
    "summarized_df.reset_index(inplace=True)\n",
    "save_for_plot(\n",
    "    \"duplicates-simul\",\n",
    "    summarized_df,\n",
    "    cols,\n",
    "    x_column=\"index\",\n",
    "    plot_name=\"Sampling from 1000 identical buckets with 200B tokens each\",\n",
    "    ylabel=\"Dataset fraction\",\n",
    "    xlabel=\"Sample size\",\n",
    "    default_metric=\"default\",\n",
    "    custom_layout={\n",
    "        \"barmode\": \"stack\",\n",
    "        \"legend\": {\n",
    "            \"title\": {\n",
    "                \"text\": \"# duplicates\",\n",
    "                \"font\": {\n",
    "                    \"size\": 14,\n",
    "                    \"weight\": \"bold\",\n",
    "                }\n",
    "            },\n",
    "            \"font\": {\n",
    "                \"size\": 14,\n",
    "            },\n",
    "            \"bgcolor\": 'rgba(255, 255, 255, 0.9)',\n",
    "            # \"borderwidth\": 1,\n",
    "            \"orientation\": \"v\",\n",
    "            \"xanchor\": \"left\",\n",
    "            \"yanchor\": \"bottom\",\n",
    "            \"x\": 0.01,\n",
    "            \"y\": 0,\n",
    "        },\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'index'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'index'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[17], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Take the sumarized_df and pivot it\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mindex\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnum_duplicates\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mduplicates_prob\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:9326\u001b[0m, in \u001b[0;36mDataFrame.pivot\u001b[0;34m(self, columns, index, values)\u001b[0m\n\u001b[1;32m   9319\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   9320\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_shared_docs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m   9321\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpivot\u001b[39m(\n\u001b[1;32m   9322\u001b[0m     \u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m, columns, index\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default, values\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default\n\u001b[1;32m   9323\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m   9324\u001b[0m     \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpivot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pivot\n\u001b[0;32m-> 9326\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/reshape/pivot.py:553\u001b[0m, in \u001b[0;36mpivot\u001b[0;34m(data, columns, index, values)\u001b[0m\n\u001b[1;32m    549\u001b[0m         index_list \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m    550\u001b[0m             data\u001b[38;5;241m.\u001b[39m_constructor_sliced(data\u001b[38;5;241m.\u001b[39mindex, name\u001b[38;5;241m=\u001b[39mdata\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m    551\u001b[0m         ]\n\u001b[1;32m    552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 553\u001b[0m     index_list \u001b[38;5;241m=\u001b[39m [\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m com\u001b[38;5;241m.\u001b[39mconvert_to_list_like(index)]\n\u001b[1;32m    555\u001b[0m data_columns \u001b[38;5;241m=\u001b[39m [data[col] \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns_listlike]\n\u001b[1;32m    556\u001b[0m index_list\u001b[38;5;241m.\u001b[39mextend(data_columns)\n",
      "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m   4089\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m   4092\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
      "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3807\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m   3808\u001b[0m         \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m   3809\u001b[0m         \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m   3810\u001b[0m     ):\n\u001b[1;32m   3811\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m   3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m   3814\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m   3815\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m   3816\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[1;32m   3817\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
      "\u001b[0;31mKeyError\u001b[0m: 'index'"
     ]
    }
   ],
   "source": [
    "# Take the sumarized_df and pivotdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "plotlyServerURL": "https://plot.ly"
       },
       "data": [
        {
         "name": "1",
         "type": "bar",
         "x": [
          "1B",
          "10B",
          "100B",
          "350B",
          "1T"
         ],
         "y": [
          0.994974,
          0.9515081,
          0.60887281,
          0.1741474885714285,
          0.006232416
         ]
        },
        {
         "name": "2",
         "type": "bar",
         "x": [
          "1B",
          "10B",
          "100B",
          "350B",
          "1T"
         ],
         "y": [
          0.005008,
          0.047331,
          0.30282154,
          0.3071204342857143,
          0.032470074
         ]
        },
        {
         "name": "3",
         "type": "bar",
         "x": [
          "1B",
          "10B",
          "100B",
          "350B",
          "1T"
         ],
         "y": [
          0.000018,
          0.0011439,
          0.0745482,
          0.2680183371428571,
          0.083742993
         ]
        },
        {
         "name": "4-8",
         "type": "bar",
         "x": [
          "1B",
          "10B",
          "100B",
          "350B",
          "1T"
         ],
         "y": [
          0,
          0.000017,
          0.01375745,
          0.25064894285714273,
          0.8176358810000001
         ]
        },
        {
         "name": "8-16",
         "type": "bar",
         "x": [
          "1B",
          "10B",
          "100B",
          "350B",
          "1T"
         ],
         "y": [
          0,
          0,
          0,
          0.00006479714285714286,
          0.05991048400000001
         ]
        },
        {
         "name": "16-32",
         "type": "bar",
         "x": [
          "1B",
          "10B",
          "100B",
          "350B",
          "1T"
         ],
         "y": [
          0,
          0,
          0,
          0,
          0.000008152000000000001
         ]
        }
       ],
       "layout": {
        "barmode": "stack",
        "legend": {
         "title": {
          "text": "# duplicates"
         }
        },
        "template": {
         "data": {
          "bar": [
           {
            "error_x": {
             "color": "#2a3f5f"
            },
            "error_y": {
             "color": "#2a3f5f"
            },
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "bar"
           }
          ],
          "barpolar": [
           {
            "marker": {
             "line": {
              "color": "#E5ECF6",
              "width": 0.5
             },
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "barpolar"
           }
          ],
          "carpet": [
           {
            "aaxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "baxis": {
             "endlinecolor": "#2a3f5f",
             "gridcolor": "white",
             "linecolor": "white",
             "minorgridcolor": "white",
             "startlinecolor": "#2a3f5f"
            },
            "type": "carpet"
           }
          ],
          "choropleth": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "choropleth"
           }
          ],
          "contour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "contour"
           }
          ],
          "contourcarpet": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "contourcarpet"
           }
          ],
          "heatmap": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmap"
           }
          ],
          "heatmapgl": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "heatmapgl"
           }
          ],
          "histogram": [
           {
            "marker": {
             "pattern": {
              "fillmode": "overlay",
              "size": 10,
              "solidity": 0.2
             }
            },
            "type": "histogram"
           }
          ],
          "histogram2d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2d"
           }
          ],
          "histogram2dcontour": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "histogram2dcontour"
           }
          ],
          "mesh3d": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "type": "mesh3d"
           }
          ],
          "parcoords": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "parcoords"
           }
          ],
          "pie": [
           {
            "automargin": true,
            "type": "pie"
           }
          ],
          "scatter": [
           {
            "fillpattern": {
             "fillmode": "overlay",
             "size": 10,
             "solidity": 0.2
            },
            "type": "scatter"
           }
          ],
          "scatter3d": [
           {
            "line": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatter3d"
           }
          ],
          "scattercarpet": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattercarpet"
           }
          ],
          "scattergeo": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergeo"
           }
          ],
          "scattergl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattergl"
           }
          ],
          "scattermapbox": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scattermapbox"
           }
          ],
          "scatterpolar": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolar"
           }
          ],
          "scatterpolargl": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterpolargl"
           }
          ],
          "scatterternary": [
           {
            "marker": {
             "colorbar": {
              "outlinewidth": 0,
              "ticks": ""
             }
            },
            "type": "scatterternary"
           }
          ],
          "surface": [
           {
            "colorbar": {
             "outlinewidth": 0,
             "ticks": ""
            },
            "colorscale": [
             [
              0,
              "#0d0887"
             ],
             [
              0.1111111111111111,
              "#46039f"
             ],
             [
              0.2222222222222222,
              "#7201a8"
             ],
             [
              0.3333333333333333,
              "#9c179e"
             ],
             [
              0.4444444444444444,
              "#bd3786"
             ],
             [
              0.5555555555555556,
              "#d8576b"
             ],
             [
              0.6666666666666666,
              "#ed7953"
             ],
             [
              0.7777777777777778,
              "#fb9f3a"
             ],
             [
              0.8888888888888888,
              "#fdca26"
             ],
             [
              1,
              "#f0f921"
             ]
            ],
            "type": "surface"
           }
          ],
          "table": [
           {
            "cells": {
             "fill": {
              "color": "#EBF0F8"
             },
             "line": {
              "color": "white"
             }
            },
            "header": {
             "fill": {
              "color": "#C8D4E3"
             },
             "line": {
              "color": "white"
             }
            },
            "type": "table"
           }
          ]
         },
         "layout": {
          "annotationdefaults": {
           "arrowcolor": "#2a3f5f",
           "arrowhead": 0,
           "arrowwidth": 1
          },
          "autotypenumbers": "strict",
          "coloraxis": {
           "colorbar": {
            "outlinewidth": 0,
            "ticks": ""
           }
          },
          "colorscale": {
           "diverging": [
            [
             0,
             "#8e0152"
            ],
            [
             0.1,
             "#c51b7d"
            ],
            [
             0.2,
             "#de77ae"
            ],
            [
             0.3,
             "#f1b6da"
            ],
            [
             0.4,
             "#fde0ef"
            ],
            [
             0.5,
             "#f7f7f7"
            ],
            [
             0.6,
             "#e6f5d0"
            ],
            [
             0.7,
             "#b8e186"
            ],
            [
             0.8,
             "#7fbc41"
            ],
            [
             0.9,
             "#4d9221"
            ],
            [
             1,
             "#276419"
            ]
           ],
           "sequential": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ],
           "sequentialminus": [
            [
             0,
             "#0d0887"
            ],
            [
             0.1111111111111111,
             "#46039f"
            ],
            [
             0.2222222222222222,
             "#7201a8"
            ],
            [
             0.3333333333333333,
             "#9c179e"
            ],
            [
             0.4444444444444444,
             "#bd3786"
            ],
            [
             0.5555555555555556,
             "#d8576b"
            ],
            [
             0.6666666666666666,
             "#ed7953"
            ],
            [
             0.7777777777777778,
             "#fb9f3a"
            ],
            [
             0.8888888888888888,
             "#fdca26"
            ],
            [
             1,
             "#f0f921"
            ]
           ]
          },
          "colorway": [
           "#636efa",
           "#EF553B",
           "#00cc96",
           "#ab63fa",
           "#FFA15A",
           "#19d3f3",
           "#FF6692",
           "#B6E880",
           "#FF97FF",
           "#FECB52"
          ],
          "font": {
           "color": "#2a3f5f"
          },
          "geo": {
           "bgcolor": "white",
           "lakecolor": "white",
           "landcolor": "#E5ECF6",
           "showlakes": true,
           "showland": true,
           "subunitcolor": "white"
          },
          "hoverlabel": {
           "align": "left"
          },
          "hovermode": "closest",
          "mapbox": {
           "style": "light"
          },
          "paper_bgcolor": "white",
          "plot_bgcolor": "#E5ECF6",
          "polar": {
           "angularaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "radialaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "scene": {
           "xaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "yaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           },
           "zaxis": {
            "backgroundcolor": "#E5ECF6",
            "gridcolor": "white",
            "gridwidth": 2,
            "linecolor": "white",
            "showbackground": true,
            "ticks": "",
            "zerolinecolor": "white"
           }
          },
          "shapedefaults": {
           "line": {
            "color": "#2a3f5f"
           }
          },
          "ternary": {
           "aaxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "baxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           },
           "bgcolor": "#E5ECF6",
           "caxis": {
            "gridcolor": "white",
            "linecolor": "white",
            "ticks": ""
           }
          },
          "title": {
           "x": 0.05
          },
          "xaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          },
          "yaxis": {
           "automargin": true,
           "gridcolor": "white",
           "linecolor": "white",
           "ticks": "",
           "title": {
            "standoff": 15
           },
           "zerolinecolor": "white",
           "zerolinewidth": 2
          }
         }
        },
        "title": {
         "text": "Sampling from 100 identical buckets with 200B tokens each"
        },
        "xaxis": {
         "title": {
          "text": "Sample size"
         }
        },
        "yaxis": {
         "range": [
          0,
          1.000001
         ],
         "title": {
          "text": "Dataset fraction"
         }
        }
       }
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n",
    "\n",
    "def summarize_ranges(df):\n",
    "    df_summarized = pd.DataFrame(index=['1', '2', '3', '4-8', '8-16', '16-32'], columns=df.columns)\n",
    "    df_summarized.loc['1'] = df.iloc[0]\n",
    "    df_summarized.loc['2'] = df.iloc[1]\n",
    "    df_summarized.loc['3'] = df.iloc[2]\n",
    "    df_summarized.loc['4-8'] = df.iloc[3:9].sum()\n",
    "    df_summarized.loc['8-16'] = df.iloc[9:17].sum()\n",
    "    df_summarized.loc['16-32'] = df.iloc[17:].sum()\n",
    "    return df_summarized\n",
    "\n",
    "summarized_df = summarize_ranges(df).T\n",
    "\n",
    "# Create a stacked bar chart using Plotly\n",
    "fig = go.Figure()\n",
    "for col in summarized_df.columns:\n",
    "    fig.add_trace(go.Bar(\n",
    "        x=summarized_df.index,\n",
    "        y=summarized_df[col],\n",
    "        name=col\n",
    "    ))\n",
    "\n",
    "fig.update_layout(\n",
    "    barmode='stack',\n",
    "    title_text=\"Sampling from 100 identical buckets with 200B tokens each\",\n",
    "    xaxis_title=\"Sample size\",\n",
    "    yaxis_title=\"Dataset fraction\",\n",
    "    yaxis=dict(range=[0, 1.000001]),\n",
    "    legend_title=\"# duplicates\",\n",
    ")\n",
    "\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['1B', '10B', '100B', '350B', '1T'], dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summarized_df.index"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "datatrove",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}