Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

App Files Files Community

HeshamHaroon committed 15 days ago

Commit

8c455b2

verified ·

1 Parent(s): a18529f

Update: Auto-evaluation on Space startup

Browse files

Files changed (1) hide show

afcl/app.py +199 -32

afcl/app.py CHANGED Viewed

@@ -205,19 +205,51 @@ def load_evaluation_dataset():
 def create_prompt(query: str, functions: List[Dict]) -> str:
-    """Create evaluation prompt."""
-    func_desc = "You are a function calling AI. Respond with JSON only.\n\nFunctions:\n"
-    for f in functions:
-        func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
-    return f"""{func_desc}
-Query: {query}
-Response format: {{"name": "function_name", "arguments": {{"key": "value"}}}}
-If no function applies: {{"name": null, "arguments": {{}}}}
-JSON:"""
 def call_model(model_id: str, prompt: str) -> str:
@@ -242,26 +274,113 @@ def call_model(model_id: str, prompt: str) -> str:
 def parse_response(response: str) -> Optional[Dict]:
-    """Parse function call from response."""
     if not response:
         return None
     try:
-        return json.loads(response.strip())
     except:
         pass
-    match = re.search(r'\{[^{}]*"name"[^{}]*\}', response)
-    if match:
-        try:
-            return json.loads(match.group())
-        except:
-            pass
-    if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
-        return {"name": None}
     return None
 def evaluate_sample(model_id: str, sample: Dict) -> float:
-    """Evaluate single sample."""
     query = sample.get('query_ar', '')
     functions = sample.get('functions', [])
     category = sample.get('category', '')
@@ -271,24 +390,72 @@ def evaluate_sample(model_id: str, sample: Dict) -> float:
     response = call_model(model_id, prompt)
     parsed = parse_response(response)
     if category == 'irrelevance':
-        return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0
-    if not ground_truth or not parsed:
         return 0.0
-    expected = ground_truth.get('calls', [ground_truth])[0] if isinstance(ground_truth, dict) else ground_truth
-    if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
         return 0.0
-    pred_args = parsed.get('arguments', {})
-    exp_args = expected.get('arguments', {})
-    if not exp_args:
-        return 1.0
-    matched = sum(1 for k, v in exp_args.items() if str(pred_args.get(k, '')).lower() == str(v).lower())
-    return matched / len(exp_args)
 def run_evaluation():

 def create_prompt(query: str, functions: List[Dict]) -> str:
     """Build the Arabic evaluation prompt for one function-calling query.

     The prompt consists of an Arabic system instruction, one entry per
     available function (name, description and, when a ``properties``
     mapping is present, one bullet per parameter marked required or
     optional), the user query, and the expected JSON answer format.

     Args:
         query: The user request inserted under the query heading.
         functions: Function specs as dicts. The keys read are ``name``,
             ``description`` and ``parameters`` (with ``properties`` and
             ``required``). Entries that are not dicts are skipped, and a
             missing or malformed ``parameters`` value simply yields no
             parameter list instead of raising.

     Returns:
         The full prompt text, ending with a newline.
     """
     lines = [
         "أنت مساعد ذكي متخصص في استدعاء الدوال البرمجية. مهمتك هي تحليل طلب المستخدم واختيار الدالة المناسبة مع تحديد المعاملات الصحيحة.",
         "### الدوال المتاحة:",
     ]

     for spec in functions or []:
         if not isinstance(spec, dict):
             continue
         lines.append(f"**{spec.get('name', '')}**")
         lines.append(f"الوصف: {spec.get('description', 'لا يوجد وصف')}")

         # ``parameters`` may be absent, None, or not a mapping at all; only a
         # real ``properties`` dict produces a parameter section.
         params = spec.get('parameters')
         properties = params.get('properties') if isinstance(params, dict) else None
         if isinstance(properties, dict):
             lines.append("المعاملات:")
             required_params = params.get('required') or ()
             for param_name, param_info in properties.items():
                 if not isinstance(param_info, dict):
                     param_info = {}
                 param_type = param_info.get('type', 'any')
                 param_desc = param_info.get('description', '')
                 req_str = " (مطلوب)" if param_name in required_params else " (اختياري)"
                 lines.append(f"  • {param_name} ({param_type}){req_str}: {param_desc}")

         # Blank separator line after every function entry.
         lines.append("")

     lines.extend([
         "### طلب المستخدم:",
         f"{query}",
         "### التعليمات:",
         "1. حلل طلب المستخدم بعناية",
         "2. اختر الدالة المناسبة من القائمة أعلاه",
         "3. استخرج قيم المعاملات من الطلب",
         "4. أجب بصيغة JSON فقط",
         "### صيغة الإجابة:",
         "إذا كانت هناك دالة مناسبة:",
         '{"name": "اسم_الدالة", "arguments": {"المعامل1": "القيمة1", "المعامل2": "القيمة2"}}',
         "إذا لم تكن هناك دالة مناسبة للطلب:",
         '{"name": null, "arguments": {}}',
         "### الإجابة (JSON فقط):",
     ])
     # Single join instead of repeated string concatenation.
     return "\n".join(lines) + "\n"
 def call_model(model_id: str, prompt: str) -> str:
 def parse_response(response: str) -> Optional[Dict]:
     """Extract a function-call dict from a raw model response.

     Strategies are tried in order and the first hit is returned:

     1. The whole (stripped) response parsed as JSON; any dict is accepted.
     2. The contents of fenced Markdown code blocks; the dict must contain
        a ``name`` key.
     3. A scan over every ``{`` in the text using the JSON decoder itself,
        so braces inside string values cannot break the match. The object
        at the first ``{`` is accepted as any dict; objects found at later
        positions must contain a ``name`` key.
     4. A keyword check for "no function applies" phrasing, which yields
        ``{"name": None, "arguments": {}}``.

     Args:
         response: Raw text returned by the model.

     Returns:
         The parsed call as a dict, or ``None`` when the response is empty
         or nothing usable was found.
     """
     if not response:
         return None
     text = response.strip()

     def _load(fragment):
         # Best-effort decode: only JSON decoding failures are swallowed.
         try:
             return json.loads(fragment)
         except (ValueError, RecursionError):
             return None

     whole = _load(text)
     if isinstance(whole, dict):
         return whole

     # Fenced code blocks: ```json ... ``` first, then bare ``` ... ```.
     fence_patterns = (
         r'```json\s*([\s\S]*?)\s*```',
         r'```\s*([\s\S]*?)\s*```',
     )
     for pattern in fence_patterns:
         for block in re.findall(pattern, text):
             data = _load(block.strip())
             if isinstance(data, dict) and 'name' in data:
                 return data

     # Let the JSON decoder find where each candidate object ends instead of
     # counting braces by hand, and keep going past candidates that fail.
     decoder = json.JSONDecoder()
     is_first = True
     start = text.find('{')
     while start != -1:
         try:
             data, _ = decoder.raw_decode(text, start)
         except (ValueError, RecursionError):
             data = None
         if isinstance(data, dict) and (is_first or 'name' in data):
             return data
         is_first = False
         start = text.find('{', start + 1)

     # Explicit "no function" indicators (substring match on lowercased text).
     no_call_patterns = (
         'no function', 'cannot', 'لا يمكن', 'لا توجد',
         'null', 'none', 'not applicable', 'غير متاح',
         'لا يوجد', 'no matching', 'no relevant',
     )
     response_lower = text.lower()
     if any(p in response_lower for p in no_call_patterns):
         return {"name": None, "arguments": {}}
     return None
# Translation table applied in one pass by normalize_arabic: code points mapped
# to None are deleted, the others are folded to a canonical letter.
_ARABIC_FOLD_TABLE = {cp: None for cp in (*range(0x064B, 0x0660), 0x0670)}
_ARABIC_FOLD_TABLE.update({
    0x0625: 0x0627,  # alef with hamza below -> bare alef
    0x0623: 0x0627,  # alef with hamza above -> bare alef
    0x0622: 0x0627,  # alef with madda       -> bare alef
    0x0629: 0x0647,  # taa marbuta           -> haa
    0x0649: 0x064A,  # alef maqsura          -> yaa
})


def normalize_arabic(text: str) -> str:
    """Normalize Arabic text for comparison.

    The value is coerced with ``str()``, then diacritics (U+064B-U+065F and
    U+0670) are removed, alef variants are folded to bare alef, taa marbuta
    to haa and alef maqsura to yaa; finally the text is lowercased and
    surrounding whitespace is stripped.

    Args:
        text: Value to normalize. ``None`` yields ``""``; other non-string
            values such as ``0`` or ``False`` are stringified rather than
            being treated as empty.

    Returns:
        The normalized string.
    """
    if text is None:
        return ""
    return str(text).translate(_ARABIC_FOLD_TABLE).lower().strip()
def compare_values(pred_val, exp_val) -> bool:
    """Return True when a predicted argument value matches the expected one.

    Checks are applied in order:

    1. Equality of the two values after ``str()`` and ``normalize_arabic``.
    2. Numeric equality when both values convert with ``float()``. This
       check is decisive: two numbers that differ are a mismatch and never
       reach the containment check (so ``1`` does not match ``10``).
    3. Containment of one normalized string in the other, for partial
       matches. Skipped when either side is empty, because the empty string
       is contained in every string.

    Args:
        pred_val: Value produced by the model.
        exp_val: Ground-truth value.

    Returns:
        True if the values are considered equal, otherwise False.
    """
    pred_str = normalize_arabic(str(pred_val))
    exp_str = normalize_arabic(str(exp_val))

    if pred_str == exp_str:
        return True

    try:
        pred_num, exp_num = float(pred_val), float(exp_val)
    except (TypeError, ValueError, OverflowError):
        # At least one side is not numeric; fall through to text matching.
        pass
    else:
        return pred_num == exp_num

    if not pred_str or not exp_str:
        return False
    return pred_str in exp_str or exp_str in pred_str
 def evaluate_sample(model_id: str, sample: Dict) -> float:
+    """Evaluate single sample with robust comparison."""
     query = sample.get('query_ar', '')
     functions = sample.get('functions', [])
     category = sample.get('category', '')
     response = call_model(model_id, prompt)
     parsed = parse_response(response)
+    # Handle irrelevance category - should NOT call any function
     if category == 'irrelevance':
+        if parsed is None:
+            return 1.0  # Correct - no valid response
+        if parsed.get('name') is None or parsed.get('name') == 'null':
+            return 1.0  # Correct - explicitly said no function
+        return 0.0  # Wrong - called a function when shouldn't
+    # For other categories, need valid response
+    if not parsed:
+        return 0.0
+    if not ground_truth:
         return 0.0
+    # Get expected function call
+    expected = ground_truth
+    if isinstance(ground_truth, dict) and 'calls' in ground_truth:
+        calls = ground_truth.get('calls', [])
+        if calls:
+            expected = calls[0]
+        else:
+            expected = ground_truth
+    # Compare function names
+    pred_name = normalize_arabic(str(parsed.get('name', '')))
+    exp_name = normalize_arabic(str(expected.get('name', '')))
+    if not pred_name or not exp_name:
         return 0.0
+    if pred_name != exp_name:
+        # Try partial match for function names
+        if pred_name not in exp_name and exp_name not in pred_name:
+            return 0.0
+    # Function name matched - now check arguments
+    pred_args = parsed.get('arguments', {}) or {}
+    exp_args = expected.get('arguments', {}) or {}
+    if not exp_args:
+        return 1.0  # No arguments expected, name matched = success
+    if not pred_args:
+        return 0.5  # Name matched but no arguments provided
+    # Compare arguments
+    matched = 0
+    total = len(exp_args)
+    for key, exp_val in exp_args.items():
+        # Try exact key match first
+        if key in pred_args:
+            if compare_values(pred_args[key], exp_val):
+                matched += 1
+                continue
+        # Try normalized key match
+        norm_key = normalize_arabic(key)
+        for pred_key, pred_val in pred_args.items():
+            if normalize_arabic(pred_key) == norm_key:
+                if compare_values(pred_val, exp_val):
+                    matched += 1
+                    break
+    return matched / total if total > 0 else 1.0
 def run_evaluation():