#!/usr/bin/env python

# HF Trainer benchmarking tool
#
# This tool can be used to run and compare multiple dimensions of the HF Trainer's args.
#
# It then prints a report twice: once in GitHub format with all the information that needs to be shared
# with others, and a second time in a console-friendly format, which is easier to use when tuning things up.
#
# The main idea is:
#
# ./trainer-benchmark.py --base-cmd '<cmd args that don't change>' \
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \
# --target-metric-key train_samples_per_second
#
# The variations can be any command line argument that you want to compare and not just dtype as in
# the example.
#
# --variations allows you to compare variations in multiple dimensions.
#
# As the first dimension has 2 options and the second has 3 in our example, this will run the trainer 6
# times, adding one of:
#
# 1. --tf32 0 --fp16 0
# 2. --tf32 0 --fp16 1
# 3. --tf32 0 --bf16 1
# 4. --tf32 1 --fp16 0
# 5. --tf32 1 --fp16 1
# 6. --tf32 1 --bf16 1
#
# and print the results. This is just a cartesian product - and more than 2 dimensions can be used.
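#
# For illustration only, the expansion is a plain cartesian product of the split dimensions, roughly
# equivalent to this snippet (the names here are illustrative; the script itself does the same thing
# with itertools.product further below):
#
#   import itertools
#   dims = [["--tf32 0", "--tf32 1"], ["--fp16 0", "--fp16 1", "--bf16 1"]]
#   variations = [" ".join(combo).strip() for combo in itertools.product(*dims)]
#   # -> ['--tf32 0 --fp16 0', '--tf32 0 --fp16 1', ..., '--tf32 1 --bf16 1']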
#
# If you want to rely on defaults, this:
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1'
# is identical to this:
# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16'
#
# the leading empty variation in the 2nd dimension is a valid variation.
#
# So here we get the following 6 variations:
#
# 1. --tf32 0
# 2. --tf32 0 --fp16
# 3. --tf32 0 --bf16
# 4. --tf32 1
# 5. --tf32 1 --fp16
# 6. --tf32 1 --bf16
#
# In this particular case we don't know what the default tf32 setting is, as it's normally
# pytorch-version dependent. That's why it's best to set each variation explicitly:
# `--tf32 0|--tf32 1`
#
# Here is a full example of a train:
#
# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
# --base-cmd \
# ' examples/pytorch/translation/run_translation.py --model_name_or_path t5-small \
# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \
# --source_prefix "translate English to Romanian: " --warmup_steps 50 \
# --max_train_samples 20000 --dataloader_num_workers 2 ' \
# --target-metric-key train_samples_per_second --repeat-times 1 --variations \
# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \
# --base-variation '--tf32 0'
#
# and here is a possible output:
#
#
# | Variation       |     Train |   Diff |   Train |
# |                 |   samples |      % |    loss |
# |                 |       per |        |         |
# |                 |    second |        |         |
# |:----------------|----------:|-------:|--------:|
# | --tf32 0        |    285.11 |      0 |    2.51 |
# | --tf32 1        |    342.09 |     20 |    2.51 |
# | --fp16 --tf32 0 |    423.49 |     49 |    2.51 |
# | --fp16 --tf32 1 |    423.13 |     48 |    2.51 |
# | --bf16 --tf32 0 |    416.80 |     46 |    2.52 |
# | --bf16 --tf32 1 |    415.87 |     46 |    2.52 |
#
#
# So you can quickly compare the different outcomes.
#
# Typically running each experiment once is enough, but if the environment is unstable you can
# re-run each variation multiple times, e.g., 3 times with --repeat-times 3, and the averaged results
# will be reported.
#
# By default it'll use the lowest result as the baseline and compare the rest to it (which is why the
# baseline row shows a diff of 0% in the table above), but you can also specify which combination to
# use as the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0'
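#
# For example, with --tf32 0 as the baseline in the table above (285.11), the --tf32 1 row is reported
# as round(100 * (342.09 - 285.11) / 285.11) = 20, i.e. roughly 20% higher throughput.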
#
# --target-metric-key tells the program which metric to compare on - the available metric keys are
# inside output_dir/all_results.json. E.g., to measure eval performance instead of train, use:
# --target-metric-key eval_samples_per_second
# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as
# well (as currently it doesn't).
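#
# For reference, output_dir/all_results.json is the metrics file the HF Trainer example scripts write
# out; an illustrative excerpt (hypothetical values) containing the kind of keys this tool reads:
#
#   {
#       "train_loss": 2.51,
#       "train_samples_per_second": 285.11,
#       "eval_samples_per_second": 310.0
#   }
#
# --target-metric-key selects one of these keys, and --report-metric-keys lists extra keys to include
# in the report table.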
#

import argparse
import datetime
import io
import itertools
import json
import math
import os
import platform
import re
import shlex
import subprocess
import sys
from pathlib import Path
from statistics import fmean

import pandas as pd
import torch
from tqdm import tqdm

import transformers


nan = float("nan")

class Tee:
    """
    A helper class to tee print's output into a file.

    Usage:

    sys.stdout = Tee(filename)
    """

    def __init__(self, filename):
        self.stdout = sys.stdout
        self.file = open(filename, "a")

    def __getattr__(self, attr):
        return getattr(self.stdout, attr)

    def write(self, msg):
        self.stdout.write(msg)
        # strip tqdm codes
        self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M))

def get_original_command(max_width=80, full_python_path=False):
    """
    Return the original command line string that can be replayed nicely and wrapped for 80 char width.

    Args:
        max_width (`int`, `optional`, defaults to 80):
            The width to wrap for.
        full_python_path (`bool`, `optional`, defaults to `False`):
            Whether to replicate the full path or just the last segment (i.e. `python`).
    """
    cmd = []

    # deal with critical env vars
    env_keys = ["CUDA_VISIBLE_DEVICES"]
    for key in env_keys:
        val = os.environ.get(key, None)
        if val is not None:
            cmd.append(f"{key}={val}")

    # python executable (not always needed if the script is executable)
    python = sys.executable if full_python_path else sys.executable.split("/")[-1]
    cmd.append(python)

    # now the normal args
    cmd += list(map(shlex.quote, sys.argv))

    # split up into up to MAX_WIDTH lines with shell multi-line escapes
    lines = []
    current_line = ""
    while len(cmd) > 0:
        current_line += f"{cmd.pop(0)} "
        if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1:
            lines.append(current_line)
            current_line = ""
    return "\\\n".join(lines)

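# For illustration only (hypothetical invocation), get_original_command() re-renders how this script
# was called, wrapped at max_width with shell `\` continuations so it can be pasted back into a
# terminal, e.g. roughly:
#
#   CUDA_VISIBLE_DEVICES=0 python trainer-benchmark.py --base-cmd \
#   '...' --target-metric-key train_samples_per_second \
#   --variations '--tf32 0|--tf32 1'
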
def get_base_command(args, output_dir):
    # unwrap multi-line input
    args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)

    # remove --output_dir if any and set our own
    args.base_cmd = re.sub(r"--output_dir\s+[^\s]+", "", args.base_cmd)
    args.base_cmd += f" --output_dir {output_dir}"

    # ensure we have --overwrite_output_dir
    args.base_cmd = re.sub(r"--overwrite_output_dir\s+", "", args.base_cmd)
    args.base_cmd += " --overwrite_output_dir"

    return [sys.executable] + shlex.split(args.base_cmd)

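# For illustration only (hypothetical base-cmd, assuming the default --output_dir output_benchmark),
# get_base_command() rewrites the user-supplied string so that the benchmark controls the output dir:
#
#   'run_translation.py --do_train --output_dir my_dir'
#   -> [sys.executable, "run_translation.py", "--do_train",
#       "--output_dir", "output_benchmark", "--overwrite_output_dir"]
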
def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):
    # Enable to debug everything but the run itself, to do it fast and see the progress.
    # This is useful for debugging the output formatting quickly - we can remove it later once
    # everybody is happy with the output
    if 0:
        import random
        from time import sleep

        sleep(0)
        return dict(
            {k: random.uniform(0, 100) for k in metric_keys},
            **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])},
        )

    result = subprocess.run(cmd, capture_output=True, text=True)

    if verbose:
        print("STDOUT", result.stdout)
        print("STDERR", result.stderr)

    # save the streams
    prefix = variation.replace(" ", "-")
    with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f:
        f.write(result.stdout)
    with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f:
        f.write(result.stderr)

    if result.returncode != 0:
        if verbose:
            print("failed")
        return {target_metric_key: nan}

    with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f:
        metrics = json.load(f)

    # filter out just the keys we want
    return {k: v for k, v in metrics.items() if k in metric_keys}

def process_run(
    id,
    cmd,
    variation_key,
    variation,
    longest_variation_len,
    target_metric_key,
    report_metric_keys,
    repeat_times,
    output_dir,
    verbose,
):
    results = []
    metrics = []
    preamble = f"{id}: {variation:<{longest_variation_len}}"
    outcome = f"{preamble}: "
    metric_keys = set(report_metric_keys + [target_metric_key])
    for i in tqdm(range(repeat_times), desc=preamble, leave=False):
        single_run_metrics = process_run_single(
            id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose
        )
        result = single_run_metrics[target_metric_key]
        if not math.isnan(result):
            metrics.append(single_run_metrics)
            results.append(result)
            outcome += "✓"
        else:
            outcome += "✘"
    outcome = f"\33[2K\r{outcome}"
    if len(metrics) > 0:
        mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()}
        mean_target = round(mean_metrics[target_metric_key], 2)
        results_str = f"{outcome} {mean_target}"
        if len(metrics) > 1:
            results_str += f" {tuple(round(x, 2) for x in results)}"
        print(results_str)
        mean_metrics[variation_key] = variation
        return mean_metrics
    else:
        print(outcome)
        return {variation_key: variation, target_metric_key: nan}

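# Each variation prints a single status line once its runs complete. Purely illustrative (made-up
# numbers), with --repeat-times 3 a fully successful variation would print roughly:
#
#   2: --tf32 1 : ✓✓✓ 342.09 (341.55, 342.2, 342.52)
#
# where ✓/✘ mark successful/failed runs, the first number is the mean of the target metric across the
# successful runs and the tuple lists the individual results.
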
def get_versions():
    properties = torch.cuda.get_device_properties(torch.device("cuda"))
    return f"""
Datetime    : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Software:
transformers: {transformers.__version__}
torch       : {torch.__version__}
cuda        : {torch.version.cuda}
python      : {platform.python_version()}

Hardware:
{torch.cuda.device_count()} GPUs      : {properties.name}, {properties.total_memory/2**30:0.2f}GB
"""

def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):
    df = pd.DataFrame(results)
    variation_key = "variation"
    diff_key = "diff_%"

    sentinel_value = nan
    if base_variation is not None and len(df[df[variation_key] == base_variation]):
        # this may still return nan
        sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item()
    if math.isnan(sentinel_value):
        # as a fallback, use the minimal value as the sentinel
        sentinel_value = df.loc[df[target_metric_key].notna(), target_metric_key].min()
    # create diff column if possible
    if not math.isnan(sentinel_value):
        df[diff_key] = df.apply(
            lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value)
            if not math.isnan(r[target_metric_key])
            else 0,
            axis="columns",
        )

    # re-order columns
    cols = [variation_key, target_metric_key, diff_key, *report_metric_keys]
    df = df.reindex(cols, axis="columns")  # reorder cols

    # capitalize
    df = df.rename(str.capitalize, axis="columns")

    # make the cols as narrow as possible
    df_github = df.rename(lambda c: c.replace("_", "<br>"), axis="columns")
    df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns")

    report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"]
    report += ["----------8<-----------------8<--------"]
    report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")]
    report += ["```"]
    report += ["*** Setup:", get_versions()]
    report += ["*** The benchmark command line was:", get_original_command()]
    report += ["```"]
    report += ["----------8<-----------------8<--------"]
    report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")]

    print("\n\n".join(report))

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-cmd",
        default=None,
        type=str,
        required=True,
        help="Base cmd",
    )
    parser.add_argument(
        "--variations",
        default=None,
        type=str,
        nargs="+",
        required=True,
        help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'",
    )
    parser.add_argument(
        "--base-variation",
        default=None,
        type=str,
| help="Baseline variation to compare to. if None the minimal target value will be used to compare against", | |
    )
    parser.add_argument(
        "--target-metric-key",
        default=None,
        type=str,
        required=True,
        help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second",
    )
    parser.add_argument(
        "--report-metric-keys",
        default="",
        type=str,
| help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples", | |
    )
    parser.add_argument(
        "--repeat-times",
        default=1,
        type=int,
        help="How many times to re-run each variation - an average will be reported",
    )
    parser.add_argument(
        "--output_dir",
        default="output_benchmark",
        type=str,
| help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked", | |
    )
    parser.add_argument(
        "--verbose",
        default=False,
        action="store_true",
        help="Whether to show the outputs of each run or just the benchmark progress",
    )
    args = parser.parse_args()

    output_dir = args.output_dir
    Path(output_dir).mkdir(exist_ok=True)
    base_cmd = get_base_command(args, output_dir)

    # split each dimension into its --foo variations
    dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations]
    # build a cartesian product of dimensions and convert those back into cmd-line arg strings,
    # while stripping white space for inputs that were empty
    variations = list(map(str.strip, map(" ".join, itertools.product(*dims))))
    longest_variation_len = max(len(x) for x in variations)

    # split wanted keys
    report_metric_keys = args.report_metric_keys.split()

    # capture prints into a log file for convenience
    report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
    print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt")
    print(f"and this script's output is also piped into {report_fn}")

    sys.stdout = Tee(report_fn)

    print(f"\n*** Running {len(variations)} benchmarks:")
    print(f"Base command: {' '.join(base_cmd)}")

    variation_key = "variation"
    results = []
    for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
        cmd = base_cmd + variation.split()
        results.append(
            process_run(
                id + 1,
                cmd,
                variation_key,
                variation,
                longest_variation_len,
                args.target_metric_key,
                report_metric_keys,
                args.repeat_times,
                output_dir,
                args.verbose,
            )
        )

    process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir)


if __name__ == "__main__":
    main()