Merge remote-tracking branch 'github/main'
# Conflicts:
# README.md
# backend/evaluation/ablation_chunk_size.py
# backend/evaluation/backbone_comparison.py
# backend/evaluation/baseline_runner.py
# backend/evaluation/benchmark_runner.py
# backend/evaluation/metrics.py
- README.md +0 -1
- backend/evaluation/baseline_runner.py +0 -3
- backend/evaluation/metrics.py +1 -1
README.md CHANGED
@@ -7,7 +7,6 @@ sdk: docker
 pinned: false
 license: mit
 ---
-
 # MEXAR Ultimate 🧠
 
 **Multimodal Explainable AI Reasoning Assistant**
backend/evaluation/baseline_runner.py CHANGED
@@ -4,13 +4,11 @@ Runs CRAG and RAPTOR baselines against a set of test queries.
 import sys
 import os
 from typing import Dict, List, Optional
-
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from modules.reasoning_engine import create_reasoning_engine
 from evaluation.metrics import MetricsRunner
 
-
 def _append_score(results: Dict[str, List[float]], baseline: str, score: Optional[float]) -> None:
     if score is None:
         print(f"{baseline}: Faithfulness score unavailable for this query.")
@@ -66,7 +64,6 @@ def run_baselines(agent_name: str, queries: List[str]):
 
     return results
 
-
 if __name__ == "__main__":
     # Example usage
     test_queries = [
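The first hunk cuts off after the `None` branch of `_append_score`, so only its guard clause is visible. As a point of reference, here is a minimal sketch of how such a helper typically completes; the `else` path (accumulating the score under the baseline's key) is an assumption, not part of the diff:

    from typing import Dict, List, Optional

    def _append_score(results: Dict[str, List[float]], baseline: str, score: Optional[float]) -> None:
        # Shown in the hunk: skip queries where the metric could not be computed.
        if score is None:
            print(f"{baseline}: Faithfulness score unavailable for this query.")
            return
        # Assumed continuation: accumulate the score in the per-baseline list.
        results.setdefault(baseline, []).append(score)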
backend/evaluation/metrics.py CHANGED
@@ -24,7 +24,7 @@ class MetricsRunner:
         return {
             "faithfulness": faith_res.score,
             "bart_nli": bart_res.score,
-            "factscore": fact_res.score
+            "factscore": fact_res.score,
         }
 
     def extract_faithfulness(self, response: Dict[str, Any]) -> Optional[float]:
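This hunk only adds a trailing comma to the metrics dict and ends at the signature of `extract_faithfulness`. For context, a plausible body for that method, assuming the response dict carries scores under a flat `"faithfulness"` key and that absence should yield `None` (both assumptions; the actual body is not in the diff):

    from typing import Any, Dict, Optional

    class MetricsRunner:  # stub for illustration only
        def extract_faithfulness(self, response: Dict[str, Any]) -> Optional[float]:
            # Assumed body: return None when the key is absent so callers
            # such as _append_score can skip the query cleanly.
            score = response.get("faithfulness")
            return float(score) if score is not None else None

Returning `Optional[float]` here is what lets `_append_score` in baseline_runner.py treat a missing metric as "skip this query" rather than as an error.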