muthuk1 committed
Commit fe0369c · verified · 1 Parent(s): 11bcc35

Add benchmark API route for live benchmark runs from dashboard

Files changed (1)
  1. web/src/app/api/benchmark/route.ts +201 -0
web/src/app/api/benchmark/route.ts ADDED
@@ -0,0 +1,201 @@
+ import { NextRequest, NextResponse } from "next/server";
+ import { callLLM, PROVIDERS, type ProviderId } from "@/lib/llm-providers";
+
+ export const runtime = "nodejs";
+ export const dynamic = "force-dynamic";
+
+ // Inline F1 computation (same as Python evaluation_layer)
+ function normalizeAnswer(s: string): string {
+   return s.toLowerCase()
+     .replace(/\b(a|an|the)\b/g, " ")
+     .replace(/[^\w\s]/g, "")
+     .replace(/\s+/g, " ")
+     .trim();
+ }
+
+ function computeF1(prediction: string, groundTruth: string): number {
+   const predTokens = normalizeAnswer(prediction).split(/\s+/).filter(Boolean);
+   const goldTokens = normalizeAnswer(groundTruth).split(/\s+/).filter(Boolean);
+   if (!predTokens.length && !goldTokens.length) return 1.0;
+   if (!predTokens.length || !goldTokens.length) return 0.0;
+   const predSet = new Map<string, number>();
+   predTokens.forEach(t => predSet.set(t, (predSet.get(t) || 0) + 1));
+   const goldSet = new Map<string, number>();
+   goldTokens.forEach(t => goldSet.set(t, (goldSet.get(t) || 0) + 1));
+   let common = 0;
+   for (const [token, count] of predSet) {
+     common += Math.min(count, goldSet.get(token) || 0);
+   }
+   if (common === 0) return 0.0;
+   const precision = common / predTokens.length;
+   const recall = common / goldTokens.length;
+   return (2 * precision * recall) / (precision + recall);
+ }
+
+ function computeEM(prediction: string, groundTruth: string): number {
+   return normalizeAnswer(prediction) === normalizeAnswer(groundTruth) ? 1.0 : 0.0;
+ }
+
+ // Sample HotpotQA questions (embedded to avoid a dataset dependency in Next.js)
+ const HOTPOTQA_SAMPLES = [
+   { question: "Were Scott Derrickson and Ed Wood of the same nationality?", answer: "Yes", type: "comparison" },
+   { question: "Which magazine was started first Arthur's Magazine or First for Women?", answer: "Arthur's Magazine", type: "comparison" },
+   { question: "Were Pavel Urysohn and Leonid Levin known for the same type of work?", answer: "Yes", type: "comparison" },
+   { question: "What film has the director who is of North Korean descent?", answer: "In the Line of Duty: The FBI Murders", type: "bridge" },
+   { question: "Which tennis player won more Grand Slam titles, Venus Williams or Serena Williams?", answer: "Serena Williams", type: "comparison" },
+   { question: "Are the Shinano River and the Tone River both located in Japan?", answer: "Yes", type: "comparison" },
+   { question: "What is the capital of the country that contains the Buda Castle?", answer: "Budapest", type: "bridge" },
+   { question: "Who was born first, Albert Einstein or Nikola Tesla?", answer: "Nikola Tesla", type: "comparison" },
+   { question: "What nationality is the director of the film 'Parasite'?", answer: "South Korean", type: "bridge" },
+   { question: "Are both the University of Chicago and Northwestern University in the same state?", answer: "Yes", type: "comparison" },
+ ];
+
+ interface BenchmarkRequest {
+   numSamples?: number;
+   provider?: ProviderId;
+   model?: string;
+ }
+
+ export async function POST(req: NextRequest) {
+   const body: BenchmarkRequest = await req.json();
+   const provider = body.provider || "anthropic";
+   const model = body.model;
+   const numSamples = Math.min(body.numSamples || 10, HOTPOTQA_SAMPLES.length);
+
+   const providerConfig = PROVIDERS[provider];
+   const hasKey = providerConfig?.isLocal || !providerConfig?.requiresApiKey || !!process.env[providerConfig?.apiKeyEnv || ""];
+
+   const results: Record<string, unknown>[] = [];
+   let totalBaselineF1 = 0, totalGraphragF1 = 0;
+   let totalBaselineEM = 0, totalGraphragEM = 0;
+   let totalBaselineTokens = 0, totalGraphragTokens = 0;
+   let totalBaselineCost = 0, totalGraphragCost = 0;
+   let totalBaselineLatency = 0, totalGraphragLatency = 0;
+   let bridgeCount = 0, compCount = 0;
+   let bridgeBaseF1 = 0, bridgeGraphF1 = 0;
+   let compBaseF1 = 0, compGraphF1 = 0;
+
+   for (let i = 0; i < numSamples; i++) {
+     const sample = HOTPOTQA_SAMPLES[i];
+
+     if (!hasKey) {
+       // Demo mode: generate plausible mock results
+       const bF1 = 0.4 + Math.random() * 0.3;
+       const gF1 = bF1 + 0.05 + Math.random() * 0.15;
+       const bTokens = 700 + Math.floor(Math.random() * 400);
+       const gTokens = 1800 + Math.floor(Math.random() * 800);
+       results.push({
+         idx: i, query: sample.question, gold: sample.answer, type: sample.type,
+         baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+         baseline_em: Math.random() > 0.6 ? 1 : 0, graphrag_em: Math.random() > 0.5 ? 1 : 0,
+         baseline_tokens: bTokens, graphrag_tokens: gTokens,
+       });
+       totalBaselineF1 += bF1; totalGraphragF1 += gF1;
+       totalBaselineTokens += bTokens; totalGraphragTokens += gTokens;
+       if (sample.type === "bridge") { bridgeCount++; bridgeBaseF1 += bF1; bridgeGraphF1 += gF1; }
+       else { compCount++; compBaseF1 += bF1; compGraphF1 += gF1; }
+       continue;
+     }
+
+     try {
+       // Pipeline A: Baseline
+       const baseStart = Date.now();
+       const baseResp = await callLLM({
+         provider, model,
+         messages: [
+           { role: "system", content: "Answer the question concisely in 1-3 words if possible." },
+           { role: "user", content: sample.question },
+         ],
+         temperature: 0, maxTokens: 128,
+       });
+       const baseLat = Date.now() - baseStart;
+
+       // Pipeline B: GraphRAG (entity extraction + graph-context generation)
+       const graphStart = Date.now();
+       const entityResp = await callLLM({
+         provider, model,
+         messages: [
+           { role: "system", content: 'Extract entities and relationships relevant to this question. Return JSON: {"entities": [{"name": "...", "type": "..."}], "relations": [{"source": "...", "target": "...", "type": "..."}]}' },
+           { role: "user", content: sample.question },
+         ],
+         temperature: 0, maxTokens: 512, jsonMode: providerConfig?.supportsJSON,
+       });
+
+       let graphContext = "";
+       try {
+         const parsed = JSON.parse(entityResp.content);
+         const ents = (parsed.entities || []).map((e: {name:string; type:string}) => `- ${e.name} (${e.type})`).join("\n");
+         const rels = (parsed.relations || []).map((r: {source:string; target:string; type:string}) => `- ${r.source} → ${r.type} → ${r.target}`).join("\n");
+         graphContext = `Entities:\n${ents}\n\nRelationships:\n${rels}`;
+       } catch { graphContext = entityResp.content; }
+
+       const graphResp = await callLLM({
+         provider, model,
+         messages: [
+           { role: "system", content: "Using the knowledge graph context, answer concisely in 1-3 words if possible. Follow relationship chains." },
+           { role: "user", content: `Context:\n${graphContext}\n\nQuestion: ${sample.question}` },
+         ],
+         temperature: 0, maxTokens: 128,
+       });
+       const graphLat = Date.now() - graphStart;
+
+       const bF1 = computeF1(baseResp.content, sample.answer);
+       const gF1 = computeF1(graphResp.content, sample.answer);
+       const bEM = computeEM(baseResp.content, sample.answer);
+       const gEM = computeEM(graphResp.content, sample.answer);
+       const gTokens = entityResp.totalTokens + graphResp.totalTokens;
+       const gCost = entityResp.costUsd + graphResp.costUsd;
+
+       results.push({
+         idx: i, query: sample.question, gold: sample.answer, type: sample.type,
+         baseline_answer: baseResp.content, graphrag_answer: graphResp.content,
+         baseline_f1: +bF1.toFixed(4), graphrag_f1: +gF1.toFixed(4),
+         baseline_em: bEM, graphrag_em: gEM,
+         baseline_tokens: baseResp.totalTokens, graphrag_tokens: gTokens,
+         baseline_cost: baseResp.costUsd, graphrag_cost: gCost,
+         baseline_latency: baseLat, graphrag_latency: graphLat,
+       });
+
+       totalBaselineF1 += bF1; totalGraphragF1 += gF1;
+       totalBaselineEM += bEM; totalGraphragEM += gEM;
+       totalBaselineTokens += baseResp.totalTokens; totalGraphragTokens += gTokens;
+       totalBaselineCost += baseResp.costUsd; totalGraphragCost += gCost;
+       totalBaselineLatency += baseLat; totalGraphragLatency += graphLat;
+       if (sample.type === "bridge") { bridgeCount++; bridgeBaseF1 += bF1; bridgeGraphF1 += gF1; }
+       else { compCount++; compBaseF1 += bF1; compGraphF1 += gF1; }
+     } catch (err) {
+       console.error(`Benchmark query ${i} failed:`, err);
+     }
+   }
+
+   const n = results.length || 1;
+   const winRate = results.filter(r => (r.graphrag_f1 as number) > (r.baseline_f1 as number)).length / n;
+
+   return NextResponse.json({
+     results,
+     aggregate: {
+       numSamples: results.length,
+       baseline: {
+         avgF1: +(totalBaselineF1 / n).toFixed(4),
+         avgEM: +(totalBaselineEM / n).toFixed(4),
+         avgTokens: Math.round(totalBaselineTokens / n),
+         avgCost: +(totalBaselineCost / n).toFixed(6),
+         avgLatency: Math.round(totalBaselineLatency / n),
+       },
+       graphrag: {
+         avgF1: +(totalGraphragF1 / n).toFixed(4),
+         avgEM: +(totalGraphragEM / n).toFixed(4),
+         avgTokens: Math.round(totalGraphragTokens / n),
+         avgCost: +(totalGraphragCost / n).toFixed(6),
+         avgLatency: Math.round(totalGraphragLatency / n),
+       },
+       graphragF1WinRate: +winRate.toFixed(4),
+       byType: {
+         bridge: bridgeCount > 0 ? { count: bridgeCount, baselineF1: +(bridgeBaseF1/bridgeCount).toFixed(4), graphragF1: +(bridgeGraphF1/bridgeCount).toFixed(4) } : null,
+         comparison: compCount > 0 ? { count: compCount, baselineF1: +(compBaseF1/compCount).toFixed(4), graphragF1: +(compGraphF1/compCount).toFixed(4) } : null,
+       },
+     },
+     provider, model: model || PROVIDERS[provider]?.defaultModel,
+     demoMode: !hasKey,
+   });
+ }
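
For reference, a minimal sketch of how the dashboard could call this route from the client. The runBenchmark helper and the BenchmarkSummary type below are illustrative only and not part of this commit; the request fields mirror BenchmarkRequest, and the response shape is inferred from the NextResponse.json payload above.

// Hypothetical client-side helper (not part of this commit).
// POSTs to the route added above and returns the parsed benchmark summary.
interface BenchmarkSummary {
  results: Record<string, unknown>[];
  aggregate: {
    numSamples: number;
    baseline: { avgF1: number; avgEM: number; avgTokens: number; avgCost: number; avgLatency: number };
    graphrag: { avgF1: number; avgEM: number; avgTokens: number; avgCost: number; avgLatency: number };
    graphragF1WinRate: number;
    byType: {
      bridge: { count: number; baselineF1: number; graphragF1: number } | null;
      comparison: { count: number; baselineF1: number; graphragF1: number } | null;
    };
  };
  provider: string;
  model?: string;
  demoMode: boolean;
}

export async function runBenchmark(numSamples = 5): Promise<BenchmarkSummary> {
  // provider and model are optional; the route falls back to "anthropic"
  // and the provider's default model when they are omitted.
  const res = await fetch("/api/benchmark", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ numSamples }),
  });
  if (!res.ok) throw new Error(`Benchmark request failed: ${res.status}`);
  return res.json() as Promise<BenchmarkSummary>;
}

When no API key is configured for the selected provider, the route returns mock rows with demoMode: true, so a helper like this can drive a demo render as well as a live run.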