HeshamHaroon committed on
Commit
8c455b2
·
verified ·
1 Parent(s): a18529f

Update: Auto-evaluation on Space startup

Browse files
Files changed (1) hide show
  1. afcl/app.py +199 -32
afcl/app.py CHANGED
@@ -205,19 +205,51 @@ def load_evaluation_dataset():
205
 
206
 
207
  def create_prompt(query: str, functions: List[Dict]) -> str:
208
- """Create evaluation prompt."""
209
- func_desc = "You are a function calling AI. Respond with JSON only.\n\nFunctions:\n"
210
- for f in functions:
211
- func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
212
-
213
- return f"""{func_desc}
214
 
215
- Query: {query}
216
 
217
- Response format: {{"name": "function_name", "arguments": {{"key": "value"}}}}
218
- If no function applies: {{"name": null, "arguments": {{}}}}
219
-
220
- JSON:"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
 
223
  def call_model(model_id: str, prompt: str) -> str:
@@ -242,26 +274,113 @@ def call_model(model_id: str, prompt: str) -> str:
242
 
243
 
244
  def parse_response(response: str) -> Optional[Dict]:
245
- """Parse function call from response."""
246
  if not response:
247
  return None
 
 
 
 
 
248
  try:
249
- return json.loads(response.strip())
 
 
250
  except:
251
  pass
252
- match = re.search(r'\{[^{}]*"name"[^{}]*\}', response)
253
- if match:
254
- try:
255
- return json.loads(match.group())
256
- except:
257
- pass
258
- if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
259
- return {"name": None}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  return None
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def evaluate_sample(model_id: str, sample: Dict) -> float:
264
- """Evaluate single sample."""
265
  query = sample.get('query_ar', '')
266
  functions = sample.get('functions', [])
267
  category = sample.get('category', '')
@@ -271,24 +390,72 @@ def evaluate_sample(model_id: str, sample: Dict) -> float:
271
  response = call_model(model_id, prompt)
272
  parsed = parse_response(response)
273
 
 
274
  if category == 'irrelevance':
275
- return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0
 
 
 
 
 
 
 
 
276
 
277
- if not ground_truth or not parsed:
278
  return 0.0
279
 
280
- expected = ground_truth.get('calls', [ground_truth])[0] if isinstance(ground_truth, dict) else ground_truth
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
283
  return 0.0
284
 
285
- pred_args = parsed.get('arguments', {})
286
- exp_args = expected.get('arguments', {})
287
- if not exp_args:
288
- return 1.0
 
 
 
 
289
 
290
- matched = sum(1 for k, v in exp_args.items() if str(pred_args.get(k, '')).lower() == str(v).lower())
291
- return matched / len(exp_args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
 
294
  def run_evaluation():
 
205
 
206
 
207
def create_prompt(query: str, functions: List[Dict]) -> str:
    """Build the Arabic evaluation prompt for one function-calling query.

    The prompt lists every candidate function (name, description and, when a
    ``parameters.properties`` mapping is present, each parameter with its
    type and a required/optional marker), then the user query, and finally
    instructions asking the model to reply with a single JSON object.

    Args:
        query: The user request to embed in the prompt.
        functions: Function specifications as dicts with optional ``name``,
            ``description`` and ``parameters`` keys. ``None`` or an empty
            list produces a prompt that lists no functions; entries that are
            not dicts are skipped.

    Returns:
        The complete prompt text.
    """
    # Arabic system prompt followed by the "available functions" heading.
    parts = ["""أنت مساعد ذكي متخصص في استدعاء الدوال البرمجية. مهمتك هي تحليل طلب المستخدم واختيار الدالة المناسبة مع تحديد المعاملات الصحيحة.

### الدوال المتاحة:

"""]

    for f in functions or []:
        if not isinstance(f, dict):
            continue
        func_name = f.get('name', '')
        func_desc = f.get('description', 'لا يوجد وصف')
        parts.append(f"**{func_name}**\n")
        parts.append(f"الوصف: {func_desc}\n")

        # ``parameters`` / ``properties`` may be missing or explicitly null in
        # a spec; only describe parameters when both are real mappings.
        params = f.get('parameters')
        properties = params.get('properties') if isinstance(params, dict) else None
        if isinstance(properties, dict):
            parts.append("المعاملات:\n")
            required_params = params.get('required') or []
            for param_name, param_info in properties.items():
                if not isinstance(param_info, dict):
                    param_info = {}
                param_type = param_info.get('type', 'any')
                param_desc = param_info.get('description', '')
                req_str = " (مطلوب)" if param_name in required_params else " (اختياري)"
                parts.append(f" • {param_name} ({param_type}){req_str}: {param_desc}\n")
        parts.append("\n")

    # User request, instructions and the expected JSON answer format.
    parts.append(f"""### طلب المستخدم:
{query}

### التعليمات:
1. حلل طلب المستخدم بعناية
2. اختر الدالة المناسبة من القائمة أعلاه
3. استخرج قيم المعاملات من الطلب
4. أجب بصيغة JSON فقط

### صيغة الإجابة:
إذا كانت هناك دالة مناسبة:
{{"name": "اسم_الدالة", "arguments": {{"المعامل1": "القيمة1", "المعامل2": "القيمة2"}}}}

إذا لم تكن هناك دالة مناسبة للطلب:
{{"name": null, "arguments": {{}}}}

### الإجابة (JSON فقط):
""")
    return "".join(parts)
253
 
254
 
255
  def call_model(model_id: str, prompt: str) -> str:
 
274
 
275
 
276
def _first_json_object(text: str, require_name: bool) -> Optional[Dict]:
    """Return the first JSON object embedded in *text*, preferring one with a ``name`` key.

    Every ``{`` is tried as a possible start of a JSON object using
    ``json.JSONDecoder.raw_decode``, which understands string literals, so
    braces inside JSON strings or stray non-JSON ``{...}`` fragments earlier
    in the text do not derail the search. When no object has a ``name`` key,
    the first decoded object is returned unless *require_name* is true.
    """
    decoder = json.JSONDecoder()
    fallback = None
    pos = text.find('{')
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(text, pos)
        except ValueError:
            # Not valid JSON from this brace; try the next candidate.
            pos = text.find('{', pos + 1)
            continue
        if isinstance(obj, dict):
            if 'name' in obj:
                return obj
            if fallback is None:
                fallback = obj
        # Skip past the decoded object so nested objects are not re-read.
        pos = text.find('{', end)
    return None if require_name else fallback


def parse_response(response: str) -> Optional[Dict]:
    """Extract a function-call dict from a raw model response.

    Strategies, in order:
      1. The whole (stripped) response is a JSON object.
      2. A JSON object containing ``name`` inside a ``` fenced block.
      3. The first JSON object anywhere in the text (one with a ``name``
         key is preferred over one without).
      4. A phrase indicating that no function applies, which yields
         ``{"name": None, "arguments": {}}``.

    Args:
        response: Raw text returned by the model.

    Returns:
        The parsed dict, or ``None`` when nothing usable is found.
    """
    if not response:
        return None

    response = response.strip()
    if not response:
        return None

    # 1. Direct parse. JSONDecodeError is a ValueError subclass.
    try:
        data = json.loads(response)
    except ValueError:
        data = None
    if isinstance(data, dict):
        return data

    # 2. Markdown code fences (with or without a language tag; a leading
    #    tag such as "json" is simply skipped by the object scan).
    for block in re.findall(r'```([\s\S]*?)```', response):
        found = _first_json_object(block, require_name=True)
        if found is not None:
            return found

    # 3. Any JSON object in the free text.
    found = _first_json_object(response, require_name=False)
    if found is not None:
        return found

    # 4. Explicit "no function" indicators (English and Arabic).
    no_call_patterns = [
        'no function', 'cannot', 'لا يمكن', 'لا توجد',
        'null', 'none', 'not applicable', 'غير متاح',
        'لا يوجد', 'no matching', 'no relevant'
    ]
    response_lower = response.lower()
    if any(p in response_lower for p in no_call_patterns):
        return {"name": None, "arguments": {}}

    return None
340
 
341
 
342
# Single-pass character map used by normalize_arabic().
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    # Diacritics (tashkeel) U+064B..U+065F are deleted.
    **{chr(cp): None for cp in range(0x064B, 0x0660)},
    '\u0670': None,      # superscript alef
    '\u0640': None,      # tatweel (kashida), purely decorative elongation
    '\u0625': '\u0627',  # alef with hamza below -> bare alef
    '\u0623': '\u0627',  # alef with hamza above -> bare alef
    '\u0622': '\u0627',  # alef with madda -> bare alef
    '\u0629': '\u0647',  # taa marbuta -> haa
    '\u0649': '\u064A',  # alef maksura -> yaa
})


def normalize_arabic(text: str) -> str:
    """Normalize Arabic text for lenient comparison.

    Removes diacritics and tatweel, unifies alef variants, maps taa marbuta
    to haa and alef maksura to yaa, then lowercases and strips surrounding
    whitespace.

    Args:
        text: Value to normalize. Non-string values are converted with
            ``str()``; ``None`` is treated as empty.

    Returns:
        The normalized string (``""`` for ``None`` or empty input).
    """
    # Only None means "no value": falsy values such as 0 or False must keep
    # their textual form instead of collapsing to an empty string.
    if text is None:
        return ""
    return str(text).translate(_ARABIC_NORMALIZATION_TABLE).lower().strip()
357
+
358
+
359
def compare_values(pred_val, exp_val) -> bool:
    """Compare a predicted argument value with the expected one.

    Matching rules, in order:
      1. Equal after ``normalize_arabic`` (the helper defined in this module).
      2. If both values convert to ``float``, the numeric comparison is
         decisive, so ``5`` never matches ``15`` through digit containment.
      3. Otherwise, a partial match where one non-empty normalized string
         contains the other.

    Args:
        pred_val: Value produced by the model.
        exp_val: Ground-truth value.

    Returns:
        ``True`` when the values are considered equivalent.
    """
    pred_str = normalize_arabic(str(pred_val))
    exp_str = normalize_arabic(str(exp_val))

    # Exact match after normalization.
    if pred_str == exp_str:
        return True

    # Numeric comparison: when both sides are numbers, equality is the answer.
    try:
        pred_num = float(pred_val)
        exp_num = float(exp_val)
    except (TypeError, ValueError, OverflowError):
        pass
    else:
        return pred_num == exp_num

    # An empty string is a substring of everything; without this guard an
    # empty prediction would match any expected value.
    if not pred_str or not exp_str:
        return False

    # Partial match for free-text values.
    return pred_str in exp_str or exp_str in pred_str
380
+
381
+
382
  def evaluate_sample(model_id: str, sample: Dict) -> float:
383
+ """Evaluate single sample with robust comparison."""
384
  query = sample.get('query_ar', '')
385
  functions = sample.get('functions', [])
386
  category = sample.get('category', '')
 
390
  response = call_model(model_id, prompt)
391
  parsed = parse_response(response)
392
 
393
+ # Handle irrelevance category - should NOT call any function
394
  if category == 'irrelevance':
395
+ if parsed is None:
396
+ return 1.0 # Correct - no valid response
397
+ if parsed.get('name') is None or parsed.get('name') == 'null':
398
+ return 1.0 # Correct - explicitly said no function
399
+ return 0.0 # Wrong - called a function when shouldn't
400
+
401
+ # For other categories, need valid response
402
+ if not parsed:
403
+ return 0.0
404
 
405
+ if not ground_truth:
406
  return 0.0
407
 
408
+ # Get expected function call
409
+ expected = ground_truth
410
+ if isinstance(ground_truth, dict) and 'calls' in ground_truth:
411
+ calls = ground_truth.get('calls', [])
412
+ if calls:
413
+ expected = calls[0]
414
+ else:
415
+ expected = ground_truth
416
+
417
+ # Compare function names
418
+ pred_name = normalize_arabic(str(parsed.get('name', '')))
419
+ exp_name = normalize_arabic(str(expected.get('name', '')))
420
 
421
+ if not pred_name or not exp_name:
422
  return 0.0
423
 
424
+ if pred_name != exp_name:
425
+ # Try partial match for function names
426
+ if pred_name not in exp_name and exp_name not in pred_name:
427
+ return 0.0
428
+
429
+ # Function name matched - now check arguments
430
+ pred_args = parsed.get('arguments', {}) or {}
431
+ exp_args = expected.get('arguments', {}) or {}
432
 
433
+ if not exp_args:
434
+ return 1.0 # No arguments expected, name matched = success
435
+
436
+ if not pred_args:
437
+ return 0.5 # Name matched but no arguments provided
438
+
439
+ # Compare arguments
440
+ matched = 0
441
+ total = len(exp_args)
442
+
443
+ for key, exp_val in exp_args.items():
444
+ # Try exact key match first
445
+ if key in pred_args:
446
+ if compare_values(pred_args[key], exp_val):
447
+ matched += 1
448
+ continue
449
+
450
+ # Try normalized key match
451
+ norm_key = normalize_arabic(key)
452
+ for pred_key, pred_val in pred_args.items():
453
+ if normalize_arabic(pred_key) == norm_key:
454
+ if compare_values(pred_val, exp_val):
455
+ matched += 1
456
+ break
457
+
458
+ return matched / total if total > 0 else 1.0
459
 
460
 
461
  def run_evaluation():