FORGE-Nano-Benchmark / benchmarks /bench_09_multi_gpu.json
ilessio-aiflowlab's picture
Upload benchmarks/bench_09_multi_gpu.json with huggingface_hub
cebf12c verified
{
"benchmark": "multi_gpu",
"timestamp": "2026-03-19T12:21:06.000529+00:00",
"n_gpus_available": 4,
"gpu_names": [
"NVIDIA L4",
"NVIDIA L4",
"NVIDIA L4",
"NVIDIA L4"
],
"inference": {
"gpu_1": {
"n_gpus": 1,
"batch_results": {
"batch_1": {
"p50_ms": 128.19,
"p95_ms": 130.86,
"mean_ms": 128.42,
"fps": 7.8,
"per_sample_ms": 128.42
},
"batch_4": {
"p50_ms": 430.18,
"p95_ms": 440.71,
"mean_ms": 429.76,
"fps": 9.3,
"per_sample_ms": 107.44
},
"batch_8": {
"p50_ms": 853.99,
"p95_ms": 873.4,
"mean_ms": 857.72,
"fps": 9.3,
"per_sample_ms": 107.21
},
"batch_16": {
"p50_ms": 1734.01,
"p95_ms": 1759.55,
"mean_ms": 1727.97,
"fps": 9.3,
"per_sample_ms": 108.0
}
},
"memory": {
"gpu_0_allocated_gb": 3.65,
"gpu_0_reserved_gb": 5.07
}
},
"gpu_2": {
"n_gpus": 2,
"batch_results": {
"batch_1": {
"p50_ms": 162.23,
"p95_ms": 168.32,
"mean_ms": 164.65,
"fps": 6.1,
"per_sample_ms": 164.65
},
"batch_4": {
"p50_ms": 611.69,
"p95_ms": 613.66,
"mean_ms": 611.27,
"fps": 6.5,
"per_sample_ms": 152.82
},
"batch_8": {
"p50_ms": 799.07,
"p95_ms": 802.8,
"mean_ms": 799.2,
"fps": 10.0,
"per_sample_ms": 99.9
},
"batch_16": {
"p50_ms": 1185.3,
"p95_ms": 1190.2,
"mean_ms": 1184.72,
"fps": 13.5,
"per_sample_ms": 74.04
}
},
"memory": {
"gpu_0_allocated_gb": 3.66,
"gpu_0_reserved_gb": 4.47,
"gpu_1_allocated_gb": 0.01,
"gpu_1_reserved_gb": 4.47
}
},
"gpu_4": {
"n_gpus": 4,
"batch_results": {
"batch_1": {
"p50_ms": 164.7,
"p95_ms": 171.49,
"mean_ms": 167.37,
"fps": 6.0,
"per_sample_ms": 167.37
},
"batch_4": {
"p50_ms": 913.4,
"p95_ms": 915.7,
"mean_ms": 912.74,
"fps": 4.4,
"per_sample_ms": 228.19
},
"batch_8": {
"p50_ms": 1003.53,
"p95_ms": 1007.34,
"mean_ms": 1002.41,
"fps": 8.0,
"per_sample_ms": 125.3
},
"batch_16": {
"p50_ms": 1178.17,
"p95_ms": 1182.78,
"mean_ms": 1178.2,
"fps": 13.6,
"per_sample_ms": 73.64
}
},
"memory": {
"gpu_0_allocated_gb": 3.66,
"gpu_0_reserved_gb": 4.49,
"gpu_1_allocated_gb": 0.01,
"gpu_1_reserved_gb": 4.16,
"gpu_2_allocated_gb": 0.01,
"gpu_2_reserved_gb": 4.16,
"gpu_3_allocated_gb": 0.01,
"gpu_3_reserved_gb": 4.16
}
}
},
"training": {
"gpu_1": {
"n_gpus": 1,
"batch_size": 2,
"n_steps": 30,
"step_time_mean_ms": 432.4,
"steps_per_sec": 2.31,
"samples_per_sec": 4.63,
"loss_start": 4.0196,
"loss_end": 1.7553,
"loss_reduction_pct": 56.3,
"memory": {
"gpu_0_peak_gb": 9.0
}
},
"gpu_2": {
"n_gpus": 2,
"batch_size": 4,
"n_steps": 30,
"step_time_mean_ms": 1264.8,
"steps_per_sec": 0.79,
"samples_per_sec": 3.16,
"loss_start": 1.3165,
"loss_end": 1.4857,
"loss_reduction_pct": -12.9,
"memory": {
"gpu_0_peak_gb": 14.59,
"gpu_1_peak_gb": 4.07
}
},
"gpu_4": {
"n_gpus": 4,
"batch_size": 8,
"n_steps": 30,
"step_time_mean_ms": 2005.1,
"steps_per_sec": 0.5,
"samples_per_sec": 3.99,
"loss_start": 6.7918,
"loss_end": 1.182,
"loss_reduction_pct": 82.6,
"memory": {
"gpu_0_peak_gb": 14.6,
"gpu_1_peak_gb": 4.07,
"gpu_2_peak_gb": 4.07,
"gpu_3_peak_gb": 4.07
}
}
},
"fp16": {
"fp16_gpu_1": {
"n_gpus": 1,
"precision": "fp16",
"batch_results": {
"batch_4": {
"p50_ms": 122.14,
"fps": 32.7,
"per_sample_ms": 30.56
},
"batch_8": {
"p50_ms": 234.11,
"fps": 34.2,
"per_sample_ms": 29.26
},
"batch_16": {
"p50_ms": 486.22,
"fps": 32.9,
"per_sample_ms": 30.43
},
"batch_32": {
"p50_ms": 950.24,
"fps": 33.6,
"per_sample_ms": 29.73
}
}
},
"fp16_gpu_4": {
"n_gpus": 4,
"precision": "fp16",
"batch_results": {
"batch_4": {
"p50_ms": 901.49,
"fps": 4.4,
"per_sample_ms": 225.52
},
"batch_8": {
"p50_ms": 903.67,
"fps": 8.8,
"per_sample_ms": 113.02
},
"batch_16": {
"p50_ms": 911.97,
"fps": 17.5,
"per_sample_ms": 57.07
},
"batch_32": {
"p50_ms": 1013.11,
"fps": 31.6,
"per_sample_ms": 31.67
}
}
}
}
}