{
  "timestamp": 1774284729.3695014,
  "results": {
    "Phase_6_Only": {
      "overall_accuracy": 0.5,
      "accuracy_by_category": {
        "factual_easy": 0.5,
        "factual_medium": 1.0,
        "conceptual_medium": 0.5,
        "reasoning_medium": 1.0,
        "tricky_medium": 0.0,
        "nuanced_hard": 0.0,
        "meta_loop_prone": 0.5
      },
      "accuracy_by_difficulty": {
        "1": 0.5,
        "2": 0.625,
        "3": 0.25
      },
      "avg_latency_ms": 0.2077141080583845,
      "total_tests": 14,
      "correct_count": 7,
      "category_stats": {
        "factual_easy": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.809056854248047
        },
        "factual_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10834465026855469
        },
        "conceptual_medium": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.10786781311035157
        },
        "reasoning_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10739097595214844
        },
        "tricky_medium": {
          "accuracy": 0.0,
          "count": 2,
          "avg_latency_ms": 0.10751018524169922
        },
        "nuanced_hard": {
          "accuracy": 0.0,
          "count": 2,
          "avg_latency_ms": 0.10751018524169922
        },
        "meta_loop_prone": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.10631809234619141
        }
      }
    },
    "Phase_6_Plus_13": {
      "overall_accuracy": 0.7857142857142857,
      "accuracy_by_category": {
        "factual_easy": 1.0,
        "factual_medium": 1.0,
        "conceptual_medium": 1.0,
        "reasoning_medium": 1.0,
        "tricky_medium": 1.0,
        "nuanced_hard": 0.5,
        "meta_loop_prone": 0.0
      },
      "accuracy_by_difficulty": {
        "1": 1.0,
        "2": 1.0,
        "3": 0.25
      },
      "avg_latency_ms": 0.10701631818498884,
      "total_tests": 14,
      "correct_count": 11,
      "category_stats": {
        "factual_easy": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10751018524169922
        },
        "factual_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10667572021484376
        },
        "conceptual_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10643730163574219
        },
        "reasoning_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10762939453125
        },
        "tricky_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10727176666259766
        },
        "nuanced_hard": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.1070333480834961
        },
        "meta_loop_prone": {
          "accuracy": 0.0,
          "count": 2,
          "avg_latency_ms": 0.10655651092529297
        }
      }
    },
    "Phase_6_Plus_13_Plus_14": {
      "overall_accuracy": 0.5714285714285714,
      "accuracy_by_category": {
        "factual_easy": 0.5,
        "factual_medium": 1.0,
        "conceptual_medium": 0.5,
        "reasoning_medium": 0.5,
        "tricky_medium": 1.0,
        "nuanced_hard": 0.5,
        "meta_loop_prone": 0.0
      },
      "accuracy_by_difficulty": {
        "1": 0.5,
        "2": 0.75,
        "3": 0.25
      },
      "avg_latency_ms": 0.10691413879394532,
      "total_tests": 14,
      "correct_count": 8,
      "category_stats": {
        "factual_easy": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.10715255737304688
        },
        "factual_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.10667572021484376
        },
        "conceptual_medium": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.10655651092529297
        },
        "reasoning_medium": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.10727176666259766
        },
        "tricky_medium": {
          "accuracy": 1.0,
          "count": 2,
          "avg_latency_ms": 0.1070333480834961
        },
        "nuanced_hard": {
          "accuracy": 0.5,
          "count": 2,
          "avg_latency_ms": 0.1070333480834961
        },
        "meta_loop_prone": {
          "accuracy": 0.0,
          "count": 2,
          "avg_latency_ms": 0.10667572021484376
        }
      }
    }
  },
  "summary": {
    "phase6_accuracy": 0.5,
    "phase6_13_accuracy": 0.7857142857142857,
    "phase6_13_14_accuracy": 0.5714285714285714,
    "improvement_13_pct": 57.14285714285714,
    "improvement_14_pct": -27.272727272727277,
    "total_improvement_pct": 138.0952380952381
  }
}