## Fetch the data from the hub

In [1]:
import os
import itertools
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import itertools
import huggingface_hub
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from huggingface_hub.utils import EntryNotFoundError

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def step_element_match(step_to_check, step_element):
    step_element = step_element.strip().replace(" ", "")
    if "-" in step_element:
        a, b = step_element.split("-")
        c = None
        if "%" in b:
            b, c = b.split("%")
        return (int(a) <= step_to_check <= int(b) and
                (c is None or (step_to_check - int(a)) % int(c) == 0))
    elif "%" in step_element:
        return step_to_check % int(step_element[1:]) == 0
    else:
        return step_to_check == int(step_element)
    
def fetch_run_results_simple(repo_name, runs_to_fetch, steps_to_fetch, prefix, agg_score_columns, column_name,
                             seed_merge_method, oauth_token=None, prefix_file=None):
    if not runs_to_fetch:
        return

    def fetch_run_files(run_to_fetch):
        def filename_to_steps_timestamp(fn):
            step, ts = fn.split("_events.out.tfevents.")
            return int(step[-7:]), int(ts[:ts.index(".")])

        run_to_fetch += "_e"
        try:
            eval_repo_file_names = [f.path for f in
                                    huggingface_hub.list_repo_tree(repo_name, run_to_fetch, expand=False,
                                                                   token=oauth_token) if
                                    "_events.out.tfevents" in f.path]
        except EntryNotFoundError:
            return []

        eval_files = [os.path.relpath(f, run_to_fetch) for f in eval_repo_file_names]
        timestamps = {}
        for fn in eval_files:
            steps, ts = filename_to_steps_timestamp(fn)
            if steps not in timestamps or timestamps[steps][0] < ts:
                timestamps[steps] = ts, fn

        results = []
        for eval_file, repofile in zip(eval_files, eval_repo_file_names):
            steps, ts = filename_to_steps_timestamp(eval_file)
            if not any(step_element_match(steps, step_el) for step_el in steps_to_fetch.split(",")):
                continue
            if timestamps[steps][1] == eval_file:
                results.append((run_to_fetch, steps, repofile))
        return results

    def load_run_file(data):
        run_to_fetch, steps, repofile = data
        loader = EventAccumulator(huggingface_hub.hf_hub_download(repo_name, repofile, token=oauth_token))
        loader.Reload()
        runname = run_to_fetch.removeprefix(prefix).removesuffix("-_e")
        column_names = ["runname", "seed", "steps", "agg_score"]
        column_values = [runname, 0, steps, 0.0]

        for tag in loader.Tags()['scalars']:
            if not "stderr" in tag and tag.split('/')[0] == 'e':
                event_list = loader.Scalars(tag)
                tag = tag.split('/')
                column_names.append(f"{tag[1]}/{tag[2]}")
                column_values.append(event_list[-1].value)

        return pd.DataFrame([column_values], columns=column_names)

    with ThreadPoolExecutor() as pool:
        run_files = list(itertools.chain.from_iterable(
            tqdm(pool.map(fetch_run_files, runs_to_fetch), total=len(runs_to_fetch), desc="Fetching datafiles...")))
        df = pd.concat(tqdm(pool.map(load_run_file, run_files), total=len(run_files), desc="Loading evals data..."))

    cols_to_avg = [col for col in agg_score_columns if col in df.columns]
    if cols_to_avg:
        df['agg_score'] = df[cols_to_avg].mean(axis=1)

    prefix_file = prefix_file + "_" if prefix_file else ""
    df.to_csv(f"{prefix_file}{repo_name.split('/')[-1]}_metrics.csv", index=False)
    print(f"Metrics saved to {repo_name.split('/')[-1]}_metrics.csv")

    return df

In [48]:
token = os.getenv("HF_TOKEN")
repo_name = "HuggingFaceTB/loubna-edu_fw_ablations"
runs_to_fetch = ["tb/edu_fw_ablations-1p82G-edu_fineweb_350b_tokens-seed-1-"]
steps_to_fetch = "%1000"
prefix = "tb/edu_fw_ablations-1p82G-"
metrics = ['commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',
                   'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']
agg_score_columns = metrics
column_name = "agg_score"
seed_merge_method = "mean"
oauth_token = token

# runs_to_fetch = [prefix + run for run in runs_to_fetch]
fetch_run_results_simple(repo_name, runs_to_fetch, steps_to_fetch, prefix, agg_score_columns, column_name, seed_merge_method, oauth_token=token)

Fetching datafiles...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.94s/it]
Loading evals data...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 82/82 [00:15<00:00,  5.37it/s]

Metrics saved to loubna-edu_fw_ablations_metrics.csv





Unnamed: 0,runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,...,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,all/acc,all/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
0,edu_fineweb_350b_tokens-seed-1,0,2000,0.390326,0.284,0.283,0.314,0.325,0.164,0.296,...,0.362,0.406,0.511,0.511,0.279674,0.299162,0.3795,0.3850,0.265997,0.284605
0,edu_fineweb_350b_tokens-seed-1,0,4000,0.414680,0.322,0.307,0.343,0.395,0.196,0.320,...,0.371,0.388,0.518,0.495,0.290613,0.312593,0.4215,0.4285,0.274401,0.295939
0,edu_fineweb_350b_tokens-seed-1,0,6000,0.428390,0.319,0.311,0.372,0.431,0.202,0.352,...,0.373,0.392,0.520,0.519,0.303980,0.323323,0.4315,0.4460,0.288591,0.306123
0,edu_fineweb_350b_tokens-seed-1,0,8000,0.443615,0.340,0.311,0.379,0.463,0.204,0.360,...,0.384,0.404,0.517,0.517,0.315148,0.333284,0.4630,0.4790,0.299186,0.314921
0,edu_fineweb_350b_tokens-seed-1,0,10000,0.441457,0.346,0.317,0.390,0.454,0.222,0.364,...,0.366,0.395,0.514,0.506,0.318935,0.335419,0.4890,0.4820,0.302189,0.317653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,edu_fineweb_350b_tokens-seed-1,0,160000,0.507129,0.430,0.359,0.473,0.593,0.282,0.418,...,0.392,0.402,0.576,0.575,0.369137,0.393898,0.5670,0.5725,0.350226,0.374533
0,edu_fineweb_350b_tokens-seed-1,0,162000,0.509118,0.416,0.367,0.474,0.592,0.288,0.408,...,0.390,0.409,0.572,0.577,0.367420,0.392861,0.5720,0.5780,0.348268,0.372947
0,edu_fineweb_350b_tokens-seed-1,0,164000,0.507843,0.416,0.365,0.467,0.591,0.276,0.408,...,0.395,0.406,0.576,0.580,0.368319,0.392000,0.5635,0.5715,0.349943,0.372246
0,edu_fineweb_350b_tokens-seed-1,0,166000,0.508308,0.415,0.364,0.472,0.593,0.282,0.414,...,0.401,0.408,0.575,0.570,0.370593,0.393176,0.5640,0.5760,0.352203,0.373463


## Plot the data

### Load csvs for FW and FW-Edu

In [14]:
import pandas as pd

# load guilherme csv with all the FW runs
df = pd.read_csv("../src_data/eval_results.csv")

# load FineWeb-Edu csv
df_2 = pd.read_csv("./loubna-edu_fw_ablations_metrics.csv")
df_2['runname'] = df_2['runname'].replace('edu_fineweb_350b_tokens-seed-1', 'FineWeb-Edu', regex=True)
df_2.head()

Unnamed: 0,runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,...,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,all/acc,all/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
0,FineWeb-Edu,0,2000,0.390326,0.284,0.283,0.314,0.325,0.164,0.296,...,0.362,0.406,0.511,0.511,0.279674,0.299162,0.3795,0.385,0.265997,0.284605
1,FineWeb-Edu,0,4000,0.41468,0.322,0.307,0.343,0.395,0.196,0.32,...,0.371,0.388,0.518,0.495,0.290613,0.312593,0.4215,0.4285,0.274401,0.295939
2,FineWeb-Edu,0,6000,0.42839,0.319,0.311,0.372,0.431,0.202,0.352,...,0.373,0.392,0.52,0.519,0.30398,0.323323,0.4315,0.446,0.288591,0.306123
3,FineWeb-Edu,0,8000,0.443615,0.34,0.311,0.379,0.463,0.204,0.36,...,0.384,0.404,0.517,0.517,0.315148,0.333284,0.463,0.479,0.299186,0.314921
4,FineWeb-Edu,0,10000,0.441457,0.346,0.317,0.39,0.454,0.222,0.364,...,0.366,0.395,0.514,0.506,0.318935,0.335419,0.489,0.482,0.302189,0.317653


In [15]:
df_full = pd.concat([df, df_2], ignore_index=True)
df_full.tail()

Unnamed: 0,runname,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,piqa/acc,...,winogrande/acc_norm,sciq/acc,sciq/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm,seed,all/acc,all/acc_norm
1253,FineWeb-Edu,160000,0.507129,0.43,0.359,0.473,0.593,0.282,0.418,0.744,...,0.575,,,0.567,0.5725,0.350226,0.374533,0.0,0.369137,0.393898
1254,FineWeb-Edu,162000,0.509118,0.416,0.367,0.474,0.592,0.288,0.408,0.747,...,0.577,,,0.572,0.578,0.348268,0.372947,0.0,0.36742,0.392861
1255,FineWeb-Edu,164000,0.507843,0.416,0.365,0.467,0.591,0.276,0.408,0.737,...,0.58,,,0.5635,0.5715,0.349943,0.372246,0.0,0.368319,0.392
1256,FineWeb-Edu,166000,0.508308,0.415,0.364,0.472,0.593,0.282,0.414,0.74,...,0.57,,,0.564,0.576,0.352203,0.373463,0.0,0.370593,0.393176
1257,FineWeb-Edu,167000,0.509494,0.429,0.362,0.472,0.597,0.29,0.418,0.738,...,0.578,,,0.567,0.5735,0.350671,0.374453,0.0,0.369666,0.394136


### Guilherme-Board plot

In [5]:
df_full.groupby("runname").agg({"steps": "count"})

Unnamed: 0_level_0,steps
runname,Unnamed: 1_level_1
C4,168
Dolma,168
FineWeb (ours),168
FineWeb-Edu,82
RedPajama2,168
RefinedWeb,168
SlimPajama,168
The Pile,168


In [6]:
fineweb_edu_steps = df_full[df_full["runname"] == "FineWeb-Edu"]["steps"].unique()
# Only selects steps that are in the fineweb_edu_steps  
df_full = df_full[df_full["steps"].isin(fineweb_edu_steps)]

In [13]:
import os
import json
from matplotlib import pyplot as plt
metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',
                   'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']

def normalize_runname(runname):
    return runname.replace("/", "_")

grouped = (
    df_full.groupby(["runname", "steps"])
    .agg(
        {
            key: "mean" for key in metrics
        }
    )
    .reset_index()
)

file_id="../assets/data/plots/edu_ablations"
files = {}
for metric in metrics:
    datas = {}
    for name, group in grouped.groupby("runname"):
        group = group[["steps", metric]].sort_values(by="steps")
        group = group.set_index("steps")
        rolling_avg = group
        # rolling_avg = group.rolling(window=5).mean()
        datas[name] = {
            "x": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),
            "y": rolling_avg[metric].tolist(),
            "label": name,
        }
    # Sort the datata based on the steps
    datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1]["y"][-1])}
    # Create a folder
    os.makedirs(f"{file_id}", exist_ok=True)
    with open(f"{file_id}/{normalize_runname(metric)}.json", "w") as f:
        json.dump({
            "data": datas,
            "layout": {
                "title": {
                    "text": "Dataset ablations"
                },
            }
        }, f)
    files[metric] = {"file": f"{normalize_runname(metric)}.json"}
# Create index
with open(f"{file_id}/index.json", "w") as f:
    json.dump({
        "files": files,
        "settings": {
            "defaultMetric": "agg_score",
            "slider":{"min":0,"max":30,"default":5},
            "caption": "üìö FineWeb-Edu outperforms üç∑ FineWeb and all other open web datasets on our group of evaluation tasks."
        }
    }, f)

### Barplot

In [None]:
!pip install -U kaleido



In [None]:
%load_ext autoreload
%autoreload 2

In [10]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import json

BASELINES = {
    "mmlu/acc_norm": 0.25,
    "arc/acc_norm": 0.25,
    "openbookqa/acc_norm": 0.25,
    "piqa/acc_norm": 0.5,
    "hellaswag/acc_norm": 0.25,
    "siqa/acc_norm": 0.33,
    "winogrande/acc_norm": 0.5,
}


def normalize_run_name(run_name):
    return run_name.replace("/", "_")


def save_for_bar(dir_name, df, metrics, default_metric="mmlu/acc_norm", xlabel="Dataset", plot_name="plot name", custom_layout={}, ranges={}):
    import os
    files = {}
    os.makedirs(f"../assets/data/plots/{dir_name}", exist_ok=True)
    for metric in metrics:
        data = {}
        for run_name in df["runname"].unique():
            data[run_name] = {
                "x": [run_name],
                "y": df[df["runname"] == run_name][metric].tolist(),
                "label": run_name,
            }
        file_name = f"{normalize_run_name(metric)}.json"
        files[metric] = {"file": f"{file_name}"}
        with open(f"../assets/data/plots/{dir_name}/{file_name}", "w") as f:
            json.dump({
                "data": data,
                "layout": {
                    "showlegend": False,
                    "title": {
                        "text": plot_name,
                    },
                    "xaxis": {
                        "title": {
                            "text": xlabel,
                            "standoff": 30
                        },
                        "tickangle": 30
                    },
                    "yaxis": {
                        "range": ranges.get(metric, [0, 1])
                    },
                    "margin": {
                        "b": 100
                    },
                    **custom_layout,
                }
            }, f)
    with open(f"../assets/data/plots/{dir_name}/index.json", "w") as f:
        json.dump({
            "files": files,
            "settings": {
                "defaultMetric": default_metric,
                "slider": None,
                "autoSetXRange": False,
                "type": "bar"
            }
        }, f)
    return files

def plot_metric_comparison(df, step, metrics, plot_name, run_name_replacements=None, output_file='comparison_plot_percentages.png', default_metric="mmlu/acc_norm", custom_layout={}):
    """
    Plot a comparison of the given metrics across different runs at the specified step and save the plot.
    """
    if run_name_replacements:
        df['runname'] = df['runname'].replace(run_name_replacements)

    df_filtered = df[df['steps'] == step]

    # Create subplots


    ranges = {}
    for i, metric in enumerate(metrics):
        yrange_start = BASELINES.get(metric, 0) * 0.9
        yrange_end = max(df_filtered[metric])
        # Adjust the end
        yrange_end = yrange_end + (yrange_end - yrange_start) * 0.2
        ranges[metric] = [yrange_start, yrange_end]
    
    file_name=f"plots/{output_file}.png"
    # fig.write_image(file_name)
    print(f"Plot saved to {file_name}")

    save_for_bar(output_file, df_filtered, metrics, default_metric, plot_name=plot_name, custom_layout=custom_layout, ranges=ranges)


metrics = [
    "agg_score",
    "mmlu/acc_norm",
    "arc/acc_norm",
    "openbookqa/acc_norm",
    "piqa/acc_norm",
    "hellaswag/acc_norm",
    "siqa/acc_norm",
    "winogrande/acc_norm",
]

plot_metric_comparison(df_full, 167000, metrics, output_file="edu-100k", plot_name="Evaluation results at 350B tokens", run_name_replacements={
    "FineWeb (ours)": "FineWeb"
})

Plot saved to plots/edu-100k.png


## Thresholds ablation

In [16]:
df

Unnamed: 0,runname,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,piqa/acc,...,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,sciq/acc,sciq/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
0,C4,0,0.330893,0.186,0.233,0.272,0.258,0.166,0.286,0.542,...,0.367,0.362,0.516,0.497,0.208,0.202,0.2195,0.2510,0.230294,0.250147
1,C4,1000,0.355112,0.229,0.260,0.286,0.288,0.128,0.250,0.614,...,0.351,0.404,0.519,0.476,0.565,0.518,0.2680,0.2935,0.238951,0.250399
2,C4,2000,0.378435,0.268,0.278,0.312,0.330,0.122,0.276,0.646,...,0.375,0.400,0.509,0.500,0.676,0.577,0.3065,0.3230,0.247275,0.255482
3,C4,3000,0.387795,0.280,0.295,0.331,0.380,0.152,0.274,0.660,...,0.376,0.387,0.512,0.496,0.725,0.621,0.3175,0.3340,0.254534,0.267363
4,C4,4000,0.399320,0.296,0.298,0.351,0.406,0.168,0.282,0.676,...,0.382,0.404,0.522,0.503,0.723,0.618,0.3255,0.3470,0.254762,0.263563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,The Pile,163000,0.463789,0.379,0.349,0.441,0.555,0.240,0.366,0.701,...,0.405,0.388,0.585,0.560,0.875,0.820,0.4475,0.4450,0.299378,0.326313
1172,The Pile,164000,0.462758,0.369,0.344,0.438,0.552,0.248,0.348,0.708,...,0.395,0.401,0.577,0.567,0.874,0.806,0.4465,0.4355,0.302083,0.331563
1173,The Pile,165000,0.465026,0.383,0.350,0.438,0.553,0.234,0.352,0.707,...,0.400,0.401,0.569,0.556,0.874,0.811,0.4460,0.4455,0.305193,0.331708
1174,The Pile,166000,0.462349,0.377,0.346,0.440,0.557,0.228,0.346,0.711,...,0.398,0.398,0.572,0.558,0.877,0.811,0.4525,0.4385,0.301952,0.331295


In [18]:
token = os.getenv("HF_TOKEN")
repo_name = "HuggingFaceTB/loubna-ablations_faq"
runs_to_fetch = ["filtered_web_min_score_4_fix-seed-1-", "fineweb_2B_educational_minimum_score_3-seed-0-", "fineweb_2B_educational_regression-seed-6-", "fineweb_2024_10_all_2B-seed-6-"]
steps_to_fetch = "%1000"
prefix = "tb/ablations_faq-1p81G-"
metrics = ['commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',
                   'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']
agg_score_columns = metrics
column_name = "agg_score"
seed_merge_method = "mean"
oauth_token = token

runs_to_fetch = [prefix + run for run in runs_to_fetch]
df = fetch_run_results_simple(repo_name, runs_to_fetch, steps_to_fetch, prefix, agg_score_columns, column_name, seed_merge_method, oauth_token=token)

Fetching datafiles...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00,  5.09it/s]
Loading evals data...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:04<00:00,  5.71it/s]

Metrics saved to loubna-ablations_faq_metrics.csv





In [19]:
df['runname'] = df['runname'].replace({"filtered_web_min_score_4_fix-seed-1": "FW-Edu-threshold=4",
                                        "fineweb_2B_educational_minimum_score_3-seed-0": "FW-Edu-threshold=3",
                                        "fineweb_2B_educational_regression-seed-6": "FW-Edu-threshold=2",
                                        "fineweb_2024_10_all_2B-seed-6": "FineWeb (FW)"}, regex=True)
df.tail()

Unnamed: 0,runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,...,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,all/acc,all/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
0,FineWeb (FW),0,4000,0.389983,0.275,0.281,0.352,0.383,0.152,0.286,...,0.365,0.385,0.505,0.493,0.265054,0.281046,0.3265,0.3435,0.2505,0.264368
0,FineWeb (FW),0,5000,0.397987,0.303,0.297,0.349,0.397,0.154,0.29,...,0.375,0.383,0.509,0.502,0.268548,0.282678,0.334,0.356,0.253134,0.264896
0,FineWeb (FW),0,6000,0.403954,0.317,0.319,0.359,0.416,0.166,0.284,...,0.379,0.4,0.516,0.49,0.268197,0.286678,0.333,0.359,0.252102,0.268633
0,FineWeb (FW),0,7000,0.404859,0.298,0.31,0.367,0.424,0.176,0.29,...,0.382,0.396,0.511,0.494,0.271701,0.289459,0.325,0.351,0.256203,0.271874
0,FineWeb (FW),0,8000,0.403283,0.33,0.319,0.364,0.412,0.176,0.276,...,0.383,0.403,0.51,0.493,0.267533,0.287018,0.3295,0.351,0.251046,0.269266


In [17]:
df["runname"]

0             C4
1             C4
2             C4
3             C4
4             C4
          ...   
1171    The Pile
1172    The Pile
1173    The Pile
1174    The Pile
1175    The Pile
Name: runname, Length: 1176, dtype: object

In [20]:

metrics = [
    "agg_score",
    "mmlu/acc_norm",
    "arc/acc_norm",
    "openbookqa/acc_norm",
    "piqa/acc_norm",
    "hellaswag/acc_norm",
    "siqa/acc_norm",
    "winogrande/acc_norm",
]
plot_metric_comparison(df, 8000, metrics, output_file="edu-8k", plot_name="FineWeb-Edu thresholding", custom_layout={
    "xaxis": {
        "title": {
            "standoff": 60,
            "text": "Dataset"
        },
        "tickangle": 30
    },
    "margin": {
        "b": 120
    }
})

Plot saved to plots/edu-8k.png
