{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2024-05-23T13:24:43.346489Z", "start_time": "2024-05-23T13:24:43.080504Z" } }, "source": [ "import pandas as pd" ], "outputs": [], "execution_count": 2 }, { "cell_type": "code", "source": "df = pd.read_csv(\"/home/gui/hf_dev/datatrove/blogpost/data/commoncrawl_new_fixed_dumps.csv\")", "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-05-23T13:24:43.359405Z", "start_time": "2024-05-23T13:24:43.347650Z" } }, "id": "157e18836c20793c", "outputs": [], "execution_count": 3 }, { "cell_type": "code", "source": [ "grouped = df.groupby('runname')\n", "\n", "\n", "def _top_rows_by_steps(group, n=6):\n", "    \"\"\"Return the `n` rows of `group` with the largest `steps` values.\"\"\"\n", "    return group.sort_values(by='steps', ascending=False).head(n)\n", "\n", "\n", "def top_6_avg(group):\n", "    \"\"\"Return the mean `agg_score` of the 6 rows of `group` with the largest `steps`.\"\"\"\n", "    return _top_rows_by_steps(group)['agg_score'].mean()\n", "\n", "\n", "def top_6_stats(group):\n", "    \"\"\"Summarise `agg_score` over the 6 rows of `group` with the largest `steps`.\n", "\n", "    Returns a Series holding the mean under 'avg' and the standard deviation\n", "    (`Series.std()` with its default arguments) under 'std_dev'.\n", "    \"\"\"\n", "    scores = _top_rows_by_steps(group)['agg_score']\n", "    return pd.Series({'avg': scores.mean(), 'std_dev': scores.std()})\n", "\n", "\n", "# Select only the columns the statistics read, so apply() never operates on the\n", "# grouping column; that use is deprecated in pandas and emits a DeprecationWarning.\n", "result = grouped[['steps', 'agg_score']].apply(top_6_stats)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-05-23T13:29:12.690882Z", "start_time": "2024-05-23T13:29:12.658225Z" } }, "id": "af7c0416a6371f9a", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_25844/3603367824.py:25: DeprecationWarning: DataFrameGroupBy.apply operated 
on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", " result = grouped.apply(top_6_stats)\n" ] } ], "execution_count": 18 }, { "cell_type": "code", "source": [ "result" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-05-23T13:29:17.414733Z", "start_time": "2024-05-23T13:29:17.409686Z" } }, "id": "65c0cd58c6f9f9d6", "outputs": [ { "data": { "text/plain": [ " avg std_dev\n", "runname \n", "2013-20 0.420114 0.001650\n", "2013-48 0.417714 0.002167\n", "2014-10 0.422538 0.002143\n", "2014-15 0.421505 0.002825\n", "2014-23 0.420056 0.001364\n", "... ... ...\n", "2023-14 0.432792 0.004059\n", "2023-23 0.433109 0.001656\n", "2023-40 0.432743 0.003919\n", "2023-50 0.433199 0.001750\n", "2024-10 0.435825 0.001774\n", "\n", "[95 rows x 2 columns]" ], "text/html": [ "
\n", " | avg | \n", "std_dev | \n", "
---|---|---|
runname | \n", "\n", " | \n", " |
2013-20 | \n", "0.420114 | \n", "0.001650 | \n", "
2013-48 | \n", "0.417714 | \n", "0.002167 | \n", "
2014-10 | \n", "0.422538 | \n", "0.002143 | \n", "
2014-15 | \n", "0.421505 | \n", "0.002825 | \n", "
2014-23 | \n", "0.420056 | \n", "0.001364 | \n", "
... | \n", "... | \n", "... | \n", "
2023-14 | \n", "0.432792 | \n", "0.004059 | \n", "
2023-23 | \n", "0.433109 | \n", "0.001656 | \n", "
2023-40 | \n", "0.432743 | \n", "0.003919 | \n", "
2023-50 | \n", "0.433199 | \n", "0.001750 | \n", "
2024-10 | \n", "0.435825 | \n", "0.001774 | \n", "
95 rows × 2 columns
\n", "