Update benchmark.py

benchmark.py (+20 -0)
@@ -141,12 +141,16 @@ class BenchmarkMetrics:
                 self.prefill_time_std = float(np.std(self.prefill_times))
                 self.prefill_time_ci = self._bootstrap_ci(self.prefill_times, config)
                 self.prefill_tokens_per_sec = config.prefill_length / self.prefill_time_mean if self.prefill_time_mean > 0 else 0.0
+            else:
+                logger.debug("No prefill time data available")
 
             if self.prefill_peak_memories:
                 memories_mb = [m / (1024 * 1024) for m in self.prefill_peak_memories]
                 self.prefill_peak_memory_mean_mb = float(np.mean(memories_mb))
                 self.prefill_peak_memory_std_mb = float(np.std(memories_mb))
                 self.prefill_peak_memory_ci_mb = self._bootstrap_ci(memories_mb, config)
+            else:
+                logger.debug("No prefill memory data available")
 
             if self.decode_times:
                 self.decode_time_per_token_mean_ms = float(np.mean(self.decode_times) * 1000)
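The confidence intervals in this hunk come from a _bootstrap_ci helper that is only called here, not shown. For orientation, a minimal percentile-bootstrap sketch is below; the bootstrap_samples and confidence_level config fields, the fixed seed, and the (low, high) return shape are assumptions for illustration, not the repository's actual implementation.

import numpy as np

def _bootstrap_ci(self, samples, config):
    # Hypothetical sketch of a percentile bootstrap over the recorded samples.
    # `config.bootstrap_samples` and `config.confidence_level` are assumed names.
    data = np.asarray(samples, dtype=float)
    if data.size == 0:
        return (0.0, 0.0)
    n_resamples = getattr(config, "bootstrap_samples", 1000)
    confidence = getattr(config, "confidence_level", 0.95)
    rng = np.random.default_rng(0)
    # Resample with replacement and record the mean of each resample.
    means = [float(np.mean(rng.choice(data, size=data.size, replace=True)))
             for _ in range(n_resamples)]
    alpha = (1.0 - confidence) / 2.0
    return (float(np.percentile(means, 100 * alpha)),
            float(np.percentile(means, 100 * (1.0 - alpha))))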
@@ -155,6 +159,8 @@ class BenchmarkMetrics:
                 self.decode_tokens_per_sec = 1.0 / np.mean(self.decode_times) if self.decode_times else 0.0
                 self.decode_time_p50_ms = float(np.percentile(self.decode_times, 50) * 1000)
                 self.decode_time_p95_ms = float(np.percentile(self.decode_times, 95) * 1000)
+            else:
+                logger.debug("No decode time data available")
 
             # Calculate end-to-end throughput
             if self.prefill_time_mean > 0 and self.decode_time_per_token_mean_ms > 0:
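The end-to-end throughput branch is cut off at the hunk boundary, so the exact formula is not visible in this diff. A plausible way to fold a mean prefill latency and a mean per-token decode latency into a single tokens-per-second figure is sketched below; the function name and the generation_length parameter are illustrative assumptions.

def end_to_end_tokens_per_sec(prefill_time_s: float, decode_ms_per_token: float,
                              prefill_length: int, generation_length: int) -> float:
    # Total tokens processed divided by total wall-clock time (hypothetical).
    decode_time_s = (decode_ms_per_token / 1000.0) * generation_length
    total_time_s = prefill_time_s + decode_time_s
    total_tokens = prefill_length + generation_length
    return total_tokens / total_time_s if total_time_s > 0 else 0.0

# Example: 0.5 s to prefill 2048 tokens, then 256 tokens at 20 ms/token.
print(end_to_end_tokens_per_sec(0.5, 20.0, 2048, 256))  # ~410 tokens/s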
@@ -165,23 +171,37 @@ class BenchmarkMetrics:
 
             if self.decode_peak_memories:
                 self.decode_peak_memory_mean_mb = float(np.mean(self.decode_peak_memories) / (1024 * 1024))
+            else:
+                logger.debug("No decode memory data available")
 
             if self.prefill_perplexities:
                 self.prefill_perplexity_mean = float(np.mean(self.prefill_perplexities))
                 self.prefill_perplexity_std = float(np.std(self.prefill_perplexities))
                 self.prefill_perplexity_ci = self._bootstrap_ci(self.prefill_perplexities, config)
+                logger.info(f"Calculated prefill perplexity: mean={self.prefill_perplexity_mean:.2f}, "
+                            f"std={self.prefill_perplexity_std:.2f}, samples={len(self.prefill_perplexities)}")
+            else:
+                logger.warning("No prefill perplexity data available")
 
             if self.generation_perplexities:
                 self.generation_perplexity_mean = float(np.mean(self.generation_perplexities))
                 self.generation_perplexity_std = float(np.std(self.generation_perplexities))
                 self.generation_perplexity_ci = self._bootstrap_ci(self.generation_perplexities, config)
+                logger.info(f"Calculated generation perplexity: mean={self.generation_perplexity_mean:.2f}, "
+                            f"std={self.generation_perplexity_std:.2f}, samples={len(self.generation_perplexities)}")
+            else:
+                logger.warning("No generation perplexity data available")
 
             if self.compression_ratios:
                 self.compression_ratio_mean = float(np.mean(self.compression_ratios))
                 self.compression_ratio_std = float(np.std(self.compression_ratios))
+            else:
+                logger.debug("No compression ratio data available")
 
             if self.kv_cache_memory_samples_mb:
                 self.kv_cache_memory_mb = float(np.mean(self.kv_cache_memory_samples_mb))
+            else:
+                logger.debug("No KV cache memory data available")
 
         except Exception as e:
             logger.error(f"Error calculating statistics: {e}")
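The kv_cache_memory_samples_mb and compression_ratios lists imply that KV-cache footprint is also tracked, but the sampling code is not part of this change. For reference, a back-of-the-envelope estimate of an uncompressed cache looks like the sketch below; every parameter name in it is an assumption for illustration.

def kv_cache_size_mb(seq_len: int, num_layers: int, num_kv_heads: int,
                     head_dim: int, dtype_bytes: int = 2) -> float:
    # Keys and values are both cached, hence the factor of 2; dtype_bytes=2 assumes fp16.
    total_bytes = 2 * num_layers * num_kv_heads * head_dim * seq_len * dtype_bytes
    return total_bytes / (1024 * 1024)

# Example: 32 layers, 8 KV heads of dim 128, fp16, 4096-token context.
print(kv_cache_size_mb(4096, 32, 8, 128))  # 512.0 MB

A compression ratio sample could then be this uncompressed estimate divided by the measured size of the compressed cache.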