{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "138889b92720ce2e", "metadata": { "ExecuteTime": { "end_time": "2024-04-30T15:08:02.398435Z", "start_time": "2024-04-30T15:08:02.194901Z" }, "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
runnameseedstepsagg_scorecommonsense_qa/acccommonsense_qa/acc_normhellaswag/acchellaswag/acc_normopenbookqa/accopenbookqa/acc_norm...siqa/accsiqa/acc_normwinogrande/accwinogrande/acc_normsciq/accsciq/acc_normarc/accarc/acc_normmmlu/accmmlu/acc_norm
0big-run-refinedweb600.3308930.1860.2330.2720.2580.1660.286...0.3670.3620.5160.4970.2080.2020.21950.25100.2302940.250147
1big-run-refinedweb610000.3534810.2330.2530.2880.2760.1200.256...0.3650.3980.5020.5000.5820.5280.26500.29000.2405830.252852
2big-run-refinedweb620000.3764610.2820.2800.3150.3280.1540.284...0.3680.3900.5110.4980.6830.5900.30550.31700.2450670.261686
3big-run-refinedweb630000.3878250.2820.2870.3310.3500.1520.306...0.3760.3860.5120.4950.7480.6460.32100.34100.2502680.266600
4big-run-refinedweb640000.3981050.3100.3180.3400.3890.1680.306...0.3710.3920.5130.4950.7360.6340.33050.34250.2507320.268341
..................................................................
1339big-run-url_dedups_lowercase_char_length61630000.4776940.3960.3750.4770.5780.2260.354...0.4080.4150.5620.5480.8790.8170.46550.45400.3036720.325554
1340big-run-url_dedups_lowercase_char_length61640000.4765910.3960.3750.4780.5810.2280.342...0.4170.4140.5550.5440.8830.8270.46000.45700.3064060.329724
1341big-run-url_dedups_lowercase_char_length61650000.4789640.4050.3880.4740.5830.2300.362...0.4140.4120.5620.5410.8810.8260.45450.44650.3041210.327213
1342big-run-url_dedups_lowercase_char_length61660000.4774670.3980.3810.4700.5790.2340.354...0.4130.4110.5540.5440.8870.8310.46250.45650.3058550.328240
1343big-run-url_dedups_lowercase_char_length61670000.4766300.3980.3700.4770.5770.2440.354...0.4130.4140.5530.5400.8790.8250.46600.45650.3079400.328538
\n", "

1344 rows × 22 columns

\n", "
" ], "text/plain": [ " runname seed steps agg_score \\\n", "0 big-run-refinedweb 6 0 0.330893 \n", "1 big-run-refinedweb 6 1000 0.353481 \n", "2 big-run-refinedweb 6 2000 0.376461 \n", "3 big-run-refinedweb 6 3000 0.387825 \n", "4 big-run-refinedweb 6 4000 0.398105 \n", "... ... ... ... ... \n", "1339 big-run-url_dedups_lowercase_char_length 6 163000 0.477694 \n", "1340 big-run-url_dedups_lowercase_char_length 6 164000 0.476591 \n", "1341 big-run-url_dedups_lowercase_char_length 6 165000 0.478964 \n", "1342 big-run-url_dedups_lowercase_char_length 6 166000 0.477467 \n", "1343 big-run-url_dedups_lowercase_char_length 6 167000 0.476630 \n", "\n", " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n", "0 0.186 0.233 0.272 \n", "1 0.233 0.253 0.288 \n", "2 0.282 0.280 0.315 \n", "3 0.282 0.287 0.331 \n", "4 0.310 0.318 0.340 \n", "... ... ... ... \n", "1339 0.396 0.375 0.477 \n", "1340 0.396 0.375 0.478 \n", "1341 0.405 0.388 0.474 \n", "1342 0.398 0.381 0.470 \n", "1343 0.398 0.370 0.477 \n", "\n", " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n", "0 0.258 0.166 0.286 ... 0.367 \n", "1 0.276 0.120 0.256 ... 0.365 \n", "2 0.328 0.154 0.284 ... 0.368 \n", "3 0.350 0.152 0.306 ... 0.376 \n", "4 0.389 0.168 0.306 ... 0.371 \n", "... ... ... ... ... ... \n", "1339 0.578 0.226 0.354 ... 0.408 \n", "1340 0.581 0.228 0.342 ... 0.417 \n", "1341 0.583 0.230 0.362 ... 0.414 \n", "1342 0.579 0.234 0.354 ... 0.413 \n", "1343 0.577 0.244 0.354 ... 0.413 \n", "\n", " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n", "0 0.362 0.516 0.497 0.208 \n", "1 0.398 0.502 0.500 0.582 \n", "2 0.390 0.511 0.498 0.683 \n", "3 0.386 0.512 0.495 0.748 \n", "4 0.392 0.513 0.495 0.736 \n", "... ... ... ... ... 
# %% Imports and data loading
import json
import os

import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.figure import Figure

# NOTE(review): `plt` and `Figure` are not referenced by any cell in this
# notebook; the imports are kept so nothing that relies on them breaks.

# Evaluation results table. Per the displayed output its columns include
# "runname", "seed", "steps", "agg_score" and per-benchmark "<task>/acc" and
# "<task>/acc_norm" scores.
df = pd.read_csv("../src_data/diff_dedup_attempts.csv")
df

# %% Run names present in the data
pd.unique(df["runname"]).tolist()

# %% Human-readable legend label for every run name
runs_mapping = {
    "big-run-refinedweb": "RefinedWeb",
    "big-run-sampled_cross_minhash_dump": "FineWeb full MinHash",
    "big-run-sampled_full_filtered_no_dedup": "FineWeb filtered only",
    "big-run-sampled_full_ind_minhash": "FineWeb independent MinHash",
    "big-run-sampled_full_imh_linededup": "FineWeb line dedup",
    "big-run-sampled_line_dedup_3lines2": "FineWeb 3-line dedup",
    "big-run-sampled_line_dedup_min_words": "FineWeb line dedup w/ min words",
    "big-run-url_dedups_lowercase_char_length": "FineWeb URL dedup"
}

# %% Export one plot-data JSON file per metric, plus an index file
metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',
           'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']

# Multiplier applied to the step count before the 1e-9 scaling of the x axis.
# NOTE(review): presumably sequence length x global batch size, i.e. tokens per
# training step, making x "billions of tokens" - confirm against the training
# configuration.
TOKENS_PER_STEP = 2048 * 1024


def normalize_runname(runname):
    """Return ``runname`` with every "/" replaced by "_".

    In this notebook it is applied to metric names (for example
    "hellaswag/acc_norm" -> "hellaswag_acc_norm") so they can be used as file
    names.
    """
    return runname.replace("/", "_")


def build_metric_traces(grouped, metric, labels):
    """Build the per-run plot traces for a single metric.

    Args:
        grouped: DataFrame with "runname" and "steps" columns and one column
            per metric.
        metric: Name of the metric column to export.
        labels: Mapping from run name to display label. Every run name in
            ``grouped`` must be present.

    Returns:
        Dict mapping run name to ``{"x": [...], "y": [...], "label": str}``.
        Entries are ordered by the metric value at the last step, highest
        first; the insertion order is what ends up in the JSON file.

    Raises:
        KeyError: If a run name has no entry in ``labels``.
    """
    traces = {}
    for name, group in grouped.groupby("runname"):
        # Values are exported unsmoothed, ordered by training step.
        series = group[["steps", metric]].sort_values(by="steps").set_index("steps")[metric]
        traces[name] = {
            "x": (series.index * TOKENS_PER_STEP * 1e-9).tolist(),
            "y": series.tolist(),
            "label": labels[name],
        }
    # Order runs by their final metric value, best first.
    return dict(sorted(traces.items(), key=lambda item: -item[1]["y"][-1]))


# Average every metric over all rows sharing the same (runname, steps) pair.
grouped = (
    df.groupby(["runname", "steps"])
    .agg({key: "mean" for key in metrics})
    .reset_index()
)

file_id = "../assets/data/plots/dedup_attempts"
# Create the output folder once, up front, so the index file can always be
# written even if the metric list is empty.
os.makedirs(file_id, exist_ok=True)

files = {}
for metric in metrics:
    file_name = f"{normalize_runname(metric)}.json"
    payload = {
        "data": build_metric_traces(grouped, metric, runs_mapping),
        "layout": {
            "title": {
                "text": "Attempting to further globally dedup worsened perf"
            },
        }
    }
    with open(f"{file_id}/{file_name}", "w") as f:
        json.dump(payload, f)
    files[metric] = {"file": file_name}

# Index listing every exported metric file plus the viewer settings.
# NOTE(review): the "slider" block presumably configures smoothing applied by
# the plot front end - confirm with the consumer of index.json.
with open(f"{file_id}/index.json", "w") as f:
    json.dump({
        "files": files,
        "settings": {
            "defaultMetric": "agg_score",
            "slider": {"min": 0, "max": 30, "default": 5}
        }
    }, f)