File size: 4,912 Bytes
b4d34a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/**
 * Extracts the attempt index from a run id of the form `${i}_${k}`.
 *
 * Only the second `_`-separated segment is inspected. Returns `undefined`
 * when the id is not a string, the segment is missing or blank, or the
 * segment is not a non-negative integer.
 */
function parseAttemptIndex(runId: unknown): number | undefined {
  if (typeof runId !== "string") return undefined;
  const segment = runId.split("_")[1];
  // Number("") and Number("  ") evaluate to 0, so a blank segment must be
  // rejected explicitly or it would masquerade as attempt 0.
  if (segment === undefined || segment.trim() === "") return undefined;
  const index = Number(segment);
  return Number.isInteger(index) && index >= 0 ? index : undefined;
}

/**
 * Aggregates raw per-run results into summary statistics.
 *
 * A task is identified by the pair `test_case` + `result.apiType`. Records
 * without a `result`, or whose `result.apiType` is not a string, are ignored
 * entirely. If several runs of one task share an attempt index, the last one
 * in `caseResults` wins.
 *
 * @param caseResults Raw run records. Fields read: `test_case`, `run_id`,
 *   `success`, `result.apiType`, `result.validResponse` and
 *   `result.toolCallingDetails`.
 * @param tries Number of attempts per task; pass rates are computed for
 *   every k from 1 to `tries`.
 * @returns
 *   - `totalTasks`: number of distinct tasks seen.
 *   - `passAtKByK[k-1]`: fraction of tasks with at least one recorded success
 *     among attempts `0..k-1`.
 *   - `passHatKByK[k-1]`: fraction of tasks where every attempt `0..k-1` has a
 *     recorded success (a missing attempt counts as a failure).
 *   - `passAtK` / `passHatK`: the values for k = `tries` (0 when unavailable).
 *   - `wrongInputToolCalls`: runs that called a tool with the right schema but
 *     not the right arguments.
 *   - `totalByApiType` / `invalidByApiType`: per-API-type run counts, and the
 *     counts of runs whose `result.validResponse` is not exactly `true`.
 */
export function analyze(caseResults: any[], tries: number) {
  type TaskKey = string;

  // task -> (attempt index -> whether that attempt succeeded)
  const successesByTask = new Map<TaskKey, Map<number, boolean>>();

  // Runs whose tool call matched the schema but carried incorrect arguments.
  let wrongInputToolCalls = 0;

  // Counted in Maps rather than plain objects: with `{}` an apiType such as
  // "constructor" or "__proto__" would resolve to an inherited member and
  // corrupt the count.
  const totalCounts = new Map<string, number>();
  const invalidCounts = new Map<string, number>();

  for (const r of caseResults) {
    if (!r?.result || typeof r.result.apiType !== "string") continue;
    const apiType: string = r.result.apiType;

    // Register the task even when the attempt index is unusable, so that it
    // still counts towards totalTasks.
    const key: TaskKey = `${r.test_case}::${apiType}`;
    let attempts = successesByTask.get(key);
    if (attempts === undefined) {
      attempts = new Map<number, boolean>();
      successesByTask.set(key, attempts);
    }
    const attemptIndex = parseAttemptIndex(r.run_id);
    if (attemptIndex !== undefined) {
      attempts.set(attemptIndex, Boolean(r.success));
    }

    const details = r.result.toolCallingDetails ?? {};
    if (
      Boolean(details.calledToolAtLeastOnce) &&
      Boolean(details.calledToolWithRightSchema) &&
      !Boolean(details.calledToolWithRightArguments)
    ) {
      wrongInputToolCalls++;
    }

    // Track invalid/total response shapes per API type.
    totalCounts.set(apiType, (totalCounts.get(apiType) ?? 0) + 1);
    if (r.result.validResponse !== true) {
      invalidCounts.set(apiType, (invalidCounts.get(apiType) ?? 0) + 1);
    }
  }

  const totalTasks = successesByTask.size;

  // pass@k and pass^k for k = 1..tries
  const passAtKByK: number[] = [];
  const passHatKByK: number[] = [];

  for (let k = 1; k <= tries; k++) {
    let tasksWithAnySuccess = 0; // any success in first k attempts
    let tasksWithAllSuccess = 0; // all success in first k attempts

    for (const attempts of successesByTask.values()) {
      let anySuccess = false;
      let allSuccess = true;
      for (let i = 0; i < k; i++) {
        if (attempts.get(i) === true) {
          anySuccess = true;
        } else {
          // Covers both a recorded failure and an attempt that never ran.
          allSuccess = false;
        }
      }
      if (anySuccess) tasksWithAnySuccess++;
      if (allSuccess) tasksWithAllSuccess++;
    }

    passAtKByK.push(totalTasks > 0 ? tasksWithAnySuccess / totalTasks : 0);
    passHatKByK.push(totalTasks > 0 ? tasksWithAllSuccess / totalTasks : 0);
  }

  // Convenience: final k=tries values
  const passAtK = passAtKByK[tries - 1] ?? 0;
  const passHatK = passHatKByK[tries - 1] ?? 0;

  // Object.fromEntries defines own data properties, so even unusual keys
  // survive the conversion back to the plain-object shape callers expect.
  const invalidByApiType: Record<string, number> =
    Object.fromEntries(invalidCounts);
  const totalByApiType: Record<string, number> =
    Object.fromEntries(totalCounts);

  return {
    totalTasks,
    passAtKByK,
    passHatKByK,
    passAtK,
    passHatK,
    wrongInputToolCalls,
    // Invalid response shapes per API
    invalidByApiType,
    totalByApiType,
  };
}

/**
 * Writes a human-readable summary of an evaluation run to the console.
 *
 * @param stats Aggregated statistics as returned by `analyze`.
 * @param caseResults Raw run records; only their count is reported.
 * @param provider Provider name shown in the summary.
 * @param selectedLines Input cases that were selected; only their count is
 *   reported.
 * @param tries Number of attempts per task; controls how many per-k entries
 *   are listed.
 * @param skipped Number of invalid `cases.jsonl` lines to report.
 * @param analysisFile Path reported as the location of the written analysis.
 */
export function printAnalysis(
  stats: ReturnType<typeof analyze>,
  caseResults: any[],
  provider: string,
  selectedLines: string[],
  tries: number,
  skipped: number,
  analysisFile: string
) {
  // Renders "1=0.500, 2=0.750, ..." with exactly `tries` entries, padding
  // missing rates with 0.
  const renderPerK = (rates: number[]) =>
    Array.from({ length: tries })
      .map((_, idx) => `${idx + 1}=${(rates[idx] ?? 0).toFixed(3)}`)
      .join(", ");

  const emit = (line: string) => console.log(line);

  // Invalid-response-shape reports, in print order: API type key plus the
  // message to show for it.
  const shapeReports: Array<[string, (bad: number, tot: number) => string]> = [
    [
      "responses",
      (bad, tot) => `  Invalid Responses API responses: ${bad} (out of ${tot})`,
    ],
    [
      "chat",
      (bad, tot) =>
        `  Invalid Chat Completions API responses: ${bad} (out of ${tot})`,
    ],
  ];

  emit("Summary:");
  emit(`  Provider: ${provider}`);
  emit(`  Total input cases: ${selectedLines.length}`);
  emit(`  Tries: ${tries}`);
  emit(`  Total tasks: ${stats.totalTasks}`);
  emit(`  Total runs: ${caseResults.length}`);

  // An API type is only mentioned when at least one run used it.
  for (const [apiType, describe] of shapeReports) {
    const tot = stats.totalByApiType[apiType] ?? 0;
    if (tot > 0) {
      const bad = stats.invalidByApiType[apiType] ?? 0;
      emit(describe(bad, tot));
    }
  }

  emit(`  pass@k (k=1..${tries}): ${renderPerK(stats.passAtKByK)}`);
  emit(`  pass^k (k=1..${tries}): ${renderPerK(stats.passHatKByK)}`);
  emit(`  pass@k (k=${tries}): ${stats.passAtK.toFixed(3)}`);
  emit(`  pass^k (k=${tries}): ${stats.passHatK.toFixed(3)}`);
  emit(`  Wrong-input tool calls: ${stats.wrongInputToolCalls}`);
  emit(`  Invalid cases.jsonl lines: ${skipped}`);
  emit(`  Analysis written to ${analysisFile}`);
}