{ "cells": [ { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "\n", "def normalize_run_name(run_name):\n", " return run_name.replace(\"/\", \"_\")\n", "\n", "def save_for_plot(dir_name, df, run_names, xlabel=\"Dataset\", ylabel=\"Matched as dups probability\", plot_name=\"plot name\", custom_layout={}, ranges={}, x_column=None, default_metric=None):\n", " import os\n", " files = {}\n", " os.makedirs(f\"data/plots/{dir_name}\", exist_ok=True)\n", " data = {}\n", " for run_name in run_names:\n", " data[run_name] = {\n", " \"x\": df[x_column].tolist() if x_column else [run_name],\n", " \"y\": df[run_name].tolist(),\n", " \"label\": run_name,\n", " }\n", " file_name = f\"default.json\"\n", " files[\"default\"] = {\"file\": f\"{file_name}\"}\n", " with open(f\"data/plots/{dir_name}/{file_name}\", \"w\") as f:\n", " json.dump({\n", " \"data\": data,\n", " \"layout\": {\n", " \"title\": {\n", " \"text\": plot_name,\n", " },\n", " \"xaxis\": {\n", " \"title\": {\n", " \"text\": xlabel,\n", " },\n", " },\n", " \"yaxis\": {\n", " # \"range\": ranges.get(view, None),\n", " \"title\": {\n", " \"text\": ylabel,\n", " },\n", " },\n", " **custom_layout,\n", " }\n", " }, f)\n", " with open(f\"data/plots/{dir_name}/index.json\", \"w\") as f:\n", " json.dump({\n", " \"files\": files,\n", " \"settings\": {\n", " \"defaultMetric\": default_metric,\n", " \"slider\": None,\n", " \"autoSetXRange\": False,\n", " \"type\": \"bar\"\n", " }\n", " }, f)\n", " return files\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1234-88-1616-32
1B0.9949740.0050080.0000180.00.00.0
10B0.9515080.0473310.0011440.0000170.00.0
100B0.6088730.3028220.0745480.0137570.00.0
350B0.1741470.307120.2680180.2506490.0000650.0
1T0.0062320.032470.0837430.8176360.059910.000008
\n", "
" ], "text/plain": [ " 1 2 3 4-8 8-16 16-32\n", "1B 0.994974 0.005008 0.000018 0.0 0.0 0.0\n", "10B 0.951508 0.047331 0.001144 0.000017 0.0 0.0\n", "100B 0.608873 0.302822 0.074548 0.013757 0.0 0.0\n", "350B 0.174147 0.30712 0.268018 0.250649 0.000065 0.0\n", "1T 0.006232 0.03247 0.083743 0.817636 0.05991 0.000008" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
index1234-88-1616-32
01B0.9949740.0050080.0000180.00.00.0
110B0.9515080.0473310.0011440.0000170.00.0
2100B0.6088730.3028220.0745480.0137570.00.0
3350B0.1741470.307120.2680180.2506490.0000650.0
41T0.0062320.032470.0837430.8176360.059910.000008
\n", "
" ], "text/plain": [ " index 1 2 3 4-8 8-16 16-32\n", "0 1B 0.994974 0.005008 0.000018 0.0 0.0 0.0\n", "1 10B 0.951508 0.047331 0.001144 0.000017 0.0 0.0\n", "2 100B 0.608873 0.302822 0.074548 0.013757 0.0 0.0\n", "3 350B 0.174147 0.30712 0.268018 0.250649 0.000065 0.0\n", "4 1T 0.006232 0.03247 0.083743 0.817636 0.05991 0.000008" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summarized_df.reset_index()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'default': {'file': 'default.json'}}" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "\n", "df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n", "\n", "\n", "def summarize_ranges(df):\n", " df_summarized = pd.DataFrame(\n", " index=[\"1\", \"2\", \"3\", \"4-8\", \"8-16\", \"16-32\"], columns=df.columns\n", " )\n", " df_summarized.loc[\"1\"] = df.iloc[0]\n", " df_summarized.loc[\"2\"] = df.iloc[1]\n", " df_summarized.loc[\"3\"] = df.iloc[2]\n", " df_summarized.loc[\"4-8\"] = df.iloc[3:9].sum()\n", " df_summarized.loc[\"8-16\"] = df.iloc[9:17].sum()\n", " df_summarized.loc[\"16-32\"] = df.iloc[17:].sum()\n", " return df_summarized\n", "\n", "\n", "summarized_df = summarize_ranges(df).T\n", "cols = summarized_df.columns\n", "summarized_df.reset_index(inplace=True)\n", "save_for_plot(\n", " \"duplicates-simul\",\n", " summarized_df,\n", " cols,\n", " x_column=\"index\",\n", " plot_name=\"Sampling from 1000 identical buckets with 200B tokens each\",\n", " ylabel=\"Dataset fraction\",\n", " xlabel=\"Sample size\",\n", " default_metric=\"default\",\n", " custom_layout={\n", " \"barmode\": \"stack\",\n", " \"legend\": {\n", " \"title\": {\n", " \"text\": \"# duplicates\",\n", " \"font\": {\n", " \"size\": 14,\n", " \"weight\": \"bold\",\n", " }\n", " },\n", " \"font\": {\n", " \"size\": 14,\n", " },\n", " \"bgcolor\": 'rgba(255, 255, 255, 0.9)',\n", " # \"borderwidth\": 1,\n", " \"orientation\": \"v\",\n", " \"xanchor\": \"left\",\n", " \"yanchor\": \"bottom\",\n", " \"x\": 0.01,\n", " \"y\": 0,\n", " },\n", " },\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'index'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'index'", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[17], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Take the sumarized_df and pivot it\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mindex\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnum_duplicates\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mduplicates_prob\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:9326\u001b[0m, in \u001b[0;36mDataFrame.pivot\u001b[0;34m(self, columns, index, values)\u001b[0m\n\u001b[1;32m 9319\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9320\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_shared_docs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 9321\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpivot\u001b[39m(\n\u001b[1;32m 9322\u001b[0m \u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m, columns, index\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default, values\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default\n\u001b[1;32m 9323\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 9324\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpivot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pivot\n\u001b[0;32m-> 9326\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/reshape/pivot.py:553\u001b[0m, in \u001b[0;36mpivot\u001b[0;34m(data, columns, index, values)\u001b[0m\n\u001b[1;32m 549\u001b[0m index_list \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 550\u001b[0m data\u001b[38;5;241m.\u001b[39m_constructor_sliced(data\u001b[38;5;241m.\u001b[39mindex, name\u001b[38;5;241m=\u001b[39mdata\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 551\u001b[0m ]\n\u001b[1;32m 552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 553\u001b[0m index_list \u001b[38;5;241m=\u001b[39m [\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m com\u001b[38;5;241m.\u001b[39mconvert_to_list_like(index)]\n\u001b[1;32m 555\u001b[0m data_columns \u001b[38;5;241m=\u001b[39m [data[col] \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns_listlike]\n\u001b[1;32m 556\u001b[0m index_list\u001b[38;5;241m.\u001b[39mextend(data_columns)\n", "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", "\u001b[0;31mKeyError\u001b[0m: 'index'" ] } ], "source": [ "# Take the sumarized_df and pivotdf" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "name": "1", "type": "bar", "x": [ "1B", "10B", "100B", "350B", "1T" ], "y": [ 0.994974, 0.9515081, 0.60887281, 0.1741474885714285, 0.006232416 ] }, { "name": "2", "type": "bar", "x": [ "1B", "10B", "100B", "350B", "1T" ], "y": [ 0.005008, 0.047331, 0.30282154, 0.3071204342857143, 0.032470074 ] }, { "name": "3", "type": "bar", "x": [ "1B", "10B", "100B", "350B", "1T" ], "y": [ 0.000018, 0.0011439, 0.0745482, 0.2680183371428571, 0.083742993 ] }, { "name": "4-8", "type": "bar", "x": [ "1B", "10B", "100B", "350B", "1T" ], "y": [ 0, 0.000017, 0.01375745, 0.25064894285714273, 0.8176358810000001 ] }, { "name": "8-16", "type": "bar", "x": [ "1B", "10B", "100B", "350B", "1T" ], "y": [ 0, 0, 0, 0.00006479714285714286, 0.05991048400000001 ] }, { "name": "16-32", "type": "bar", "x": [ "1B", "10B", "100B", "350B", "1T" ], "y": [ 0, 0, 0, 0, 0.000008152000000000001 ] } ], "layout": { "barmode": "stack", "legend": { "title": { "text": "# duplicates" } }, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Sampling from 100 identical buckets with 200B tokens each" }, "xaxis": { "title": { "text": "Sample size" } }, "yaxis": { "range": [ 0, 1.000001 ], "title": { "text": "Dataset fraction" } } } } }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "import plotly.graph_objects as go\n", "\n", "df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n", "\n", "def summarize_ranges(df):\n", " df_summarized = pd.DataFrame(index=['1', '2', '3', '4-8', '8-16', '16-32'], columns=df.columns)\n", " df_summarized.loc['1'] = df.iloc[0]\n", " df_summarized.loc['2'] = df.iloc[1]\n", " df_summarized.loc['3'] = df.iloc[2]\n", " df_summarized.loc['4-8'] = df.iloc[3:9].sum()\n", " df_summarized.loc['8-16'] = df.iloc[9:17].sum()\n", " df_summarized.loc['16-32'] = df.iloc[17:].sum()\n", " return df_summarized\n", "\n", "summarized_df = summarize_ranges(df).T\n", "\n", "# Create a stacked bar chart using Plotly\n", "fig = go.Figure()\n", "for col in summarized_df.columns:\n", " fig.add_trace(go.Bar(\n", " x=summarized_df.index,\n", " y=summarized_df[col],\n", " name=col\n", " ))\n", "\n", "fig.update_layout(\n", " barmode='stack',\n", " title_text=\"Sampling from 100 identical buckets with 200B tokens each\",\n", " xaxis_title=\"Sample size\",\n", " yaxis_title=\"Dataset fraction\",\n", " yaxis=dict(range=[0, 1.000001]),\n", " legend_title=\"# duplicates\",\n", ")\n", "\n", "fig.show()\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['1B', '10B', '100B', '350B', '1T'], dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summarized_df.index" ] } ], "metadata": { "kernelspec": { "display_name": "datatrove", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }