{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"\n",
"def normalize_run_name(run_name):\n",
" return run_name.replace(\"/\", \"_\")\n",
"\n",
"def save_for_plot(dir_name, df, run_names, xlabel=\"Dataset\", ylabel=\"Matched as dups probability\", plot_name=\"plot name\", custom_layout={}, ranges={}, x_column=None, default_metric=None):\n",
" import os\n",
" files = {}\n",
" os.makedirs(f\"data/plots/{dir_name}\", exist_ok=True)\n",
" data = {}\n",
" for run_name in run_names:\n",
" data[run_name] = {\n",
" \"x\": df[x_column].tolist() if x_column else [run_name],\n",
" \"y\": df[run_name].tolist(),\n",
" \"label\": run_name,\n",
" }\n",
" file_name = f\"default.json\"\n",
" files[\"default\"] = {\"file\": f\"{file_name}\"}\n",
" with open(f\"data/plots/{dir_name}/{file_name}\", \"w\") as f:\n",
" json.dump({\n",
" \"data\": data,\n",
" \"layout\": {\n",
" \"title\": {\n",
" \"text\": plot_name,\n",
" },\n",
" \"xaxis\": {\n",
" \"title\": {\n",
" \"text\": xlabel,\n",
" },\n",
" },\n",
" \"yaxis\": {\n",
" # \"range\": ranges.get(view, None),\n",
" \"title\": {\n",
" \"text\": ylabel,\n",
" },\n",
" },\n",
" **custom_layout,\n",
" }\n",
" }, f)\n",
" with open(f\"data/plots/{dir_name}/index.json\", \"w\") as f:\n",
" json.dump({\n",
" \"files\": files,\n",
" \"settings\": {\n",
" \"defaultMetric\": default_metric,\n",
" \"slider\": None,\n",
" \"autoSetXRange\": False,\n",
" \"type\": \"bar\"\n",
" }\n",
" }, f)\n",
" return files\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4-8 | \n",
" 8-16 | \n",
" 16-32 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1B | \n",
" 0.994974 | \n",
" 0.005008 | \n",
" 0.000018 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 10B | \n",
" 0.951508 | \n",
" 0.047331 | \n",
" 0.001144 | \n",
" 0.000017 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 100B | \n",
" 0.608873 | \n",
" 0.302822 | \n",
" 0.074548 | \n",
" 0.013757 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 350B | \n",
" 0.174147 | \n",
" 0.30712 | \n",
" 0.268018 | \n",
" 0.250649 | \n",
" 0.000065 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1T | \n",
" 0.006232 | \n",
" 0.03247 | \n",
" 0.083743 | \n",
" 0.817636 | \n",
" 0.05991 | \n",
" 0.000008 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 1 2 3 4-8 8-16 16-32\n",
"1B 0.994974 0.005008 0.000018 0.0 0.0 0.0\n",
"10B 0.951508 0.047331 0.001144 0.000017 0.0 0.0\n",
"100B 0.608873 0.302822 0.074548 0.013757 0.0 0.0\n",
"350B 0.174147 0.30712 0.268018 0.250649 0.000065 0.0\n",
"1T 0.006232 0.03247 0.083743 0.817636 0.05991 0.000008"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4-8 | \n",
" 8-16 | \n",
" 16-32 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1B | \n",
" 0.994974 | \n",
" 0.005008 | \n",
" 0.000018 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 10B | \n",
" 0.951508 | \n",
" 0.047331 | \n",
" 0.001144 | \n",
" 0.000017 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 100B | \n",
" 0.608873 | \n",
" 0.302822 | \n",
" 0.074548 | \n",
" 0.013757 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 350B | \n",
" 0.174147 | \n",
" 0.30712 | \n",
" 0.268018 | \n",
" 0.250649 | \n",
" 0.000065 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1T | \n",
" 0.006232 | \n",
" 0.03247 | \n",
" 0.083743 | \n",
" 0.817636 | \n",
" 0.05991 | \n",
" 0.000008 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index 1 2 3 4-8 8-16 16-32\n",
"0 1B 0.994974 0.005008 0.000018 0.0 0.0 0.0\n",
"1 10B 0.951508 0.047331 0.001144 0.000017 0.0 0.0\n",
"2 100B 0.608873 0.302822 0.074548 0.013757 0.0 0.0\n",
"3 350B 0.174147 0.30712 0.268018 0.250649 0.000065 0.0\n",
"4 1T 0.006232 0.03247 0.083743 0.817636 0.05991 0.000008"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summarized_df.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'default': {'file': 'default.json'}}"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"\n",
"df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n",
"\n",
"\n",
"def summarize_ranges(df):\n",
" df_summarized = pd.DataFrame(\n",
" index=[\"1\", \"2\", \"3\", \"4-8\", \"8-16\", \"16-32\"], columns=df.columns\n",
" )\n",
" df_summarized.loc[\"1\"] = df.iloc[0]\n",
" df_summarized.loc[\"2\"] = df.iloc[1]\n",
" df_summarized.loc[\"3\"] = df.iloc[2]\n",
" df_summarized.loc[\"4-8\"] = df.iloc[3:9].sum()\n",
" df_summarized.loc[\"8-16\"] = df.iloc[9:17].sum()\n",
" df_summarized.loc[\"16-32\"] = df.iloc[17:].sum()\n",
" return df_summarized\n",
"\n",
"\n",
"summarized_df = summarize_ranges(df).T\n",
"cols = summarized_df.columns\n",
"summarized_df.reset_index(inplace=True)\n",
"save_for_plot(\n",
" \"duplicates-simul\",\n",
" summarized_df,\n",
" cols,\n",
" x_column=\"index\",\n",
" plot_name=\"Sampling from 1000 identical buckets with 200B tokens each\",\n",
" ylabel=\"Dataset fraction\",\n",
" xlabel=\"Sample size\",\n",
" default_metric=\"default\",\n",
" custom_layout={\n",
" \"barmode\": \"stack\",\n",
" \"legend\": {\n",
" \"title\": {\n",
" \"text\": \"# duplicates\",\n",
" \"font\": {\n",
" \"size\": 14,\n",
" \"weight\": \"bold\",\n",
" }\n",
" },\n",
" \"font\": {\n",
" \"size\": 14,\n",
" },\n",
" \"bgcolor\": 'rgba(255, 255, 255, 0.9)',\n",
" # \"borderwidth\": 1,\n",
" \"orientation\": \"v\",\n",
" \"xanchor\": \"left\",\n",
" \"yanchor\": \"bottom\",\n",
" \"x\": 0.01,\n",
" \"y\": 0,\n",
" },\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'index'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
"File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 'index'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[17], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Take the sumarized_df and pivot it\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mindex\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnum_duplicates\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mduplicates_prob\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:9326\u001b[0m, in \u001b[0;36mDataFrame.pivot\u001b[0;34m(self, columns, index, values)\u001b[0m\n\u001b[1;32m 9319\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9320\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_shared_docs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 9321\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpivot\u001b[39m(\n\u001b[1;32m 9322\u001b[0m \u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m, columns, index\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default, values\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default\n\u001b[1;32m 9323\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 9324\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpivot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pivot\n\u001b[0;32m-> 9326\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/reshape/pivot.py:553\u001b[0m, in \u001b[0;36mpivot\u001b[0;34m(data, columns, index, values)\u001b[0m\n\u001b[1;32m 549\u001b[0m index_list \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 550\u001b[0m data\u001b[38;5;241m.\u001b[39m_constructor_sliced(data\u001b[38;5;241m.\u001b[39mindex, name\u001b[38;5;241m=\u001b[39mdata\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 551\u001b[0m ]\n\u001b[1;32m 552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 553\u001b[0m index_list \u001b[38;5;241m=\u001b[39m [\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m com\u001b[38;5;241m.\u001b[39mconvert_to_list_like(index)]\n\u001b[1;32m 555\u001b[0m data_columns \u001b[38;5;241m=\u001b[39m [data[col] \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns_listlike]\n\u001b[1;32m 556\u001b[0m index_list\u001b[38;5;241m.\u001b[39mextend(data_columns)\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
"File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
"\u001b[0;31mKeyError\u001b[0m: 'index'"
]
}
],
"source": [
"# Take the sumarized_df and pivotdf"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"name": "1",
"type": "bar",
"x": [
"1B",
"10B",
"100B",
"350B",
"1T"
],
"y": [
0.994974,
0.9515081,
0.60887281,
0.1741474885714285,
0.006232416
]
},
{
"name": "2",
"type": "bar",
"x": [
"1B",
"10B",
"100B",
"350B",
"1T"
],
"y": [
0.005008,
0.047331,
0.30282154,
0.3071204342857143,
0.032470074
]
},
{
"name": "3",
"type": "bar",
"x": [
"1B",
"10B",
"100B",
"350B",
"1T"
],
"y": [
0.000018,
0.0011439,
0.0745482,
0.2680183371428571,
0.083742993
]
},
{
"name": "4-8",
"type": "bar",
"x": [
"1B",
"10B",
"100B",
"350B",
"1T"
],
"y": [
0,
0.000017,
0.01375745,
0.25064894285714273,
0.8176358810000001
]
},
{
"name": "8-16",
"type": "bar",
"x": [
"1B",
"10B",
"100B",
"350B",
"1T"
],
"y": [
0,
0,
0,
0.00006479714285714286,
0.05991048400000001
]
},
{
"name": "16-32",
"type": "bar",
"x": [
"1B",
"10B",
"100B",
"350B",
"1T"
],
"y": [
0,
0,
0,
0,
0.000008152000000000001
]
}
],
"layout": {
"barmode": "stack",
"legend": {
"title": {
"text": "# duplicates"
}
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Sampling from 100 identical buckets with 200B tokens each"
},
"xaxis": {
"title": {
"text": "Sample size"
}
},
"yaxis": {
"range": [
0,
1.000001
],
"title": {
"text": "Dataset fraction"
}
}
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import plotly.graph_objects as go\n",
"\n",
"df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n",
"\n",
"def summarize_ranges(df):\n",
" df_summarized = pd.DataFrame(index=['1', '2', '3', '4-8', '8-16', '16-32'], columns=df.columns)\n",
" df_summarized.loc['1'] = df.iloc[0]\n",
" df_summarized.loc['2'] = df.iloc[1]\n",
" df_summarized.loc['3'] = df.iloc[2]\n",
" df_summarized.loc['4-8'] = df.iloc[3:9].sum()\n",
" df_summarized.loc['8-16'] = df.iloc[9:17].sum()\n",
" df_summarized.loc['16-32'] = df.iloc[17:].sum()\n",
" return df_summarized\n",
"\n",
"summarized_df = summarize_ranges(df).T\n",
"\n",
"# Create a stacked bar chart using Plotly\n",
"fig = go.Figure()\n",
"for col in summarized_df.columns:\n",
" fig.add_trace(go.Bar(\n",
" x=summarized_df.index,\n",
" y=summarized_df[col],\n",
" name=col\n",
" ))\n",
"\n",
"fig.update_layout(\n",
" barmode='stack',\n",
" title_text=\"Sampling from 100 identical buckets with 200B tokens each\",\n",
" xaxis_title=\"Sample size\",\n",
" yaxis_title=\"Dataset fraction\",\n",
" yaxis=dict(range=[0, 1.000001]),\n",
" legend_title=\"# duplicates\",\n",
")\n",
"\n",
"fig.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['1B', '10B', '100B', '350B', '1T'], dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summarized_df.index"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "datatrove",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}