|
|
export function analyze(caseResults: any[], tries: number) { |
|
|
|
|
|
type TaskKey = string; |
|
|
const taskKeyFor = (r: any): TaskKey => |
|
|
`${r.test_case}::${r.result?.apiType}`; |
|
|
|
|
|
const successesByTask: Map<TaskKey, Map<number, boolean>> = new Map(); |
|
|
|
|
|
|
|
|
let wrongInputToolCalls = 0; |
|
|
|
|
|
|
|
|
const totalByApiType: Record<string, number> = {}; |
|
|
const invalidByApiType: Record<string, number> = {}; |
|
|
|
|
|
for (const r of caseResults) { |
|
|
if (!r?.result || typeof r.result.apiType !== "string") continue; |
|
|
|
|
|
|
|
|
let attemptIndex: number | undefined; |
|
|
if (typeof r.run_id === "string") { |
|
|
const parts = r.run_id.split("_"); |
|
|
const k = Number(parts[1]); |
|
|
if (Number.isFinite(k)) attemptIndex = k; |
|
|
} |
|
|
|
|
|
const key = taskKeyFor(r); |
|
|
if (!successesByTask.has(key)) successesByTask.set(key, new Map()); |
|
|
if (attemptIndex != null) { |
|
|
successesByTask.get(key)!.set(attemptIndex, Boolean(r.success)); |
|
|
} |
|
|
|
|
|
const d = r.result.toolCallingDetails ?? {}; |
|
|
const calledToolAtLeastOnce = Boolean(d.calledToolAtLeastOnce); |
|
|
const calledToolWithRightSchema = Boolean(d.calledToolWithRightSchema); |
|
|
const calledToolWithRightArguments = Boolean( |
|
|
d.calledToolWithRightArguments |
|
|
); |
|
|
if ( |
|
|
calledToolAtLeastOnce && |
|
|
calledToolWithRightSchema && |
|
|
!calledToolWithRightArguments |
|
|
) { |
|
|
wrongInputToolCalls++; |
|
|
} |
|
|
|
|
|
|
|
|
const apiType = r.result.apiType as string; |
|
|
totalByApiType[apiType] = (totalByApiType[apiType] ?? 0) + 1; |
|
|
const isValidResponse = r.result.validResponse === true; |
|
|
if (!isValidResponse) { |
|
|
invalidByApiType[apiType] = (invalidByApiType[apiType] ?? 0) + 1; |
|
|
} |
|
|
} |
|
|
|
|
|
const totalTasks = successesByTask.size; |
|
|
|
|
|
|
|
|
const passAtKByK: number[] = []; |
|
|
const passHatKByK: number[] = []; |
|
|
|
|
|
for (let k = 1; k <= tries; k++) { |
|
|
let tasksSuccessfulK = 0; |
|
|
let tasksAllSuccessfulK = 0; |
|
|
|
|
|
for (const [, attemptsMap] of successesByTask) { |
|
|
let anySuccess = false; |
|
|
let allSuccess = true; |
|
|
for (let i = 0; i < k; i++) { |
|
|
const v = attemptsMap.get(i) === true; |
|
|
anySuccess = anySuccess || v; |
|
|
if (!v) allSuccess = false; |
|
|
} |
|
|
if (anySuccess) tasksSuccessfulK++; |
|
|
if (allSuccess) tasksAllSuccessfulK++; |
|
|
} |
|
|
|
|
|
const passAtK = totalTasks > 0 ? tasksSuccessfulK / totalTasks : 0; |
|
|
const passHatK = totalTasks > 0 ? tasksAllSuccessfulK / totalTasks : 0; |
|
|
passAtKByK.push(passAtK); |
|
|
passHatKByK.push(passHatK); |
|
|
} |
|
|
|
|
|
|
|
|
const passAtK = passAtKByK[tries - 1] ?? 0; |
|
|
const passHatK = passHatKByK[tries - 1] ?? 0; |
|
|
|
|
|
return { |
|
|
totalTasks, |
|
|
passAtKByK, |
|
|
passHatKByK, |
|
|
passAtK, |
|
|
passHatK, |
|
|
wrongInputToolCalls, |
|
|
|
|
|
invalidByApiType, |
|
|
totalByApiType, |
|
|
}; |
|
|
} |
|
|
|
|
|
export function printAnalysis( |
|
|
stats: ReturnType<typeof analyze>, |
|
|
caseResults: any[], |
|
|
provider: string, |
|
|
selectedLines: string[], |
|
|
tries: number, |
|
|
skipped: number, |
|
|
analysisFile: string |
|
|
) { |
|
|
const formatPerK = (arr: number[]) => |
|
|
Array.from({ length: tries }, (_, i) => { |
|
|
const v = arr[i] ?? 0; |
|
|
return `${i + 1}=${v.toFixed(3)}`; |
|
|
}).join(", "); |
|
|
|
|
|
console.log("Summary:"); |
|
|
console.log(` Provider: ${provider}`); |
|
|
console.log(` Total input cases: ${selectedLines.length}`); |
|
|
console.log(` Tries: ${tries}`); |
|
|
console.log(` Total tasks: ${stats.totalTasks}`); |
|
|
console.log(` Total runs: ${caseResults.length}`); |
|
|
|
|
|
if ((stats.totalByApiType["responses"] ?? 0) > 0) { |
|
|
const bad = stats.invalidByApiType["responses"] ?? 0; |
|
|
const tot = stats.totalByApiType["responses"] ?? 0; |
|
|
console.log(` Invalid Responses API responses: ${bad} (out of ${tot})`); |
|
|
} |
|
|
if ((stats.totalByApiType["chat"] ?? 0) > 0) { |
|
|
const bad = stats.invalidByApiType["chat"] ?? 0; |
|
|
const tot = stats.totalByApiType["chat"] ?? 0; |
|
|
console.log( |
|
|
` Invalid Chat Completions API responses: ${bad} (out of ${tot})` |
|
|
); |
|
|
} |
|
|
console.log(` pass@k (k=1..${tries}): ${formatPerK(stats.passAtKByK)}`); |
|
|
console.log(` pass^k (k=1..${tries}): ${formatPerK(stats.passHatKByK)}`); |
|
|
console.log(` pass@k (k=${tries}): ${stats.passAtK.toFixed(3)}`); |
|
|
console.log(` pass^k (k=${tries}): ${stats.passHatK.toFixed(3)}`); |
|
|
console.log(` Wrong-input tool calls: ${stats.wrongInputToolCalls}`); |
|
|
console.log(` Invalid cases.jsonl lines: ${skipped}`); |
|
|
console.log(` Analysis written to ${analysisFile}`); |
|
|
} |
|
|
|