arjunanand13 committed
Commit • e1175ed
1 Parent(s): ababf21
Update app.py
app.py
CHANGED
@@ -47,38 +47,60 @@ class RAGEvaluator:
         self.current_dataset = None
         self.test_samples = []

-    def load_dataset(self, dataset_name: str, num_samples: int =
-        """Load a smaller subset of questions"""
-        … [removed lines 52-78 were not rendered in this view]
+    def load_dataset(self, dataset_name: str, num_samples: int = 10):
+        """Load a smaller subset of questions with proper error handling"""
+        try:
+            if dataset_name == "squad":
+                dataset = load_dataset("squad_v2", split="validation")
+                # Select diverse questions
+                samples = dataset.select(range(0, 1000, 100))[:num_samples]
+
+                self.test_samples = []
+                for sample in samples:
+                    # Check if answers exist and are not empty
+                    if sample.get("answers") and isinstance(sample["answers"], dict) and sample["answers"].get("text"):
+                        self.test_samples.append({
+                            "question": sample["question"],
+                            "ground_truth": sample["answers"]["text"][0],
+                            "context": sample["context"]
+                        })
+
+            elif dataset_name == "msmarco":
+                dataset = load_dataset("ms_marco", "v2.1", split="dev")
+                samples = dataset.select(range(0, 1000, 100))[:num_samples]
+
+                self.test_samples = []
+                for sample in samples:
+                    # Check for valid answers
+                    if sample.get("answers") and sample["answers"]:
+                        self.test_samples.append({
+                            "question": sample["query"],
+                            "ground_truth": sample["answers"][0],
+                            "context": sample["passages"][0]["passage_text"]
+                                       if isinstance(sample["passages"], list)
+                                       else sample["passages"]["passage_text"][0]
+                        })
+
+            self.current_dataset = dataset_name
+
+            # Return dataset info
+            return {
+                "dataset": dataset_name,
+                "num_samples": len(self.test_samples),
+                "sample_questions": [s["question"] for s in self.test_samples[:3]],
+                "status": "success"
+            }
+
+        except Exception as e:
+            print(f"Error loading dataset: {str(e)}")
+            return {
+                "dataset": dataset_name,
+                "error": str(e),
+                "status": "failed"
+            }

     def evaluate_configuration(self, vector_db, qa_chain, splitting_strategy: str, chunk_size: str) -> Dict:
-        """Evaluate with progress tracking"""
+        """Evaluate with progress tracking and error handling"""
         if not self.test_samples:
             return {"error": "No dataset loaded"}
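One caveat in the added sampling code: slicing a Hugging Face Dataset with [:num_samples] returns a dict of columns rather than a list of rows, so "for sample in samples" iterates over column names and the per-row checks above never see a row dict. A minimal row-wise alternative, as a sketch only (the helper name sample_rows is illustrative, not part of this commit):

from datasets import load_dataset

def sample_rows(name: str, split: str, num_samples: int = 10, stride: int = 100):
    """Select every stride-th row, keeping at most num_samples rows."""
    ds = load_dataset(name, split=split)
    picked = ds.select(range(0, min(len(ds), num_samples * stride), stride))
    # Iterating a Dataset yields one dict per row; integer slicing does not.
    return list(picked)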
@@ -105,11 +127,17 @@ class RAGEvaluator:
             print(f"Error processing question {i+1}: {str(e)}")
             continue

-        … [removed lines 108-111 were not rendered in this view]
+        if not results:
+            return {
+                "configuration": f"{splitting_strategy}_{chunk_size}",
+                "error": "No successful evaluations",
+                "questions_evaluated": 0
+            }
+
         try:
+            # Calculate RAGAS metrics
+            eval_dataset = Dataset.from_list(results)
+            metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
             scores = evaluate(eval_dataset, metrics=metrics)

             return {
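For Dataset.from_list(results) to work with these RAGAS metrics, each record in results needs the column names RAGAS looks for. A hedged example of one record (field names follow common ragas conventions; some ragas versions instead expect a ground_truths list):

record = {
    "question": "What color is the sky?",              # query sent to the QA chain
    "answer": "The sky is blue.",                      # generated answer being judged
    "contexts": ["The sky appears blue because ..."],  # retrieved passages, a list of strings
    "ground_truth": "Blue.",                           # reference answer from the dataset
}
# eval_dataset = Dataset.from_list([record, ...]); scores = evaluate(eval_dataset, metrics=metrics)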
@@ -417,12 +445,25 @@ def demo():
     )

     def load_dataset_handler(dataset_name):
-        … [removed lines 420-425 were not rendered in this view]
+        try:
+            result = evaluator.load_dataset(dataset_name)
+            if result.get("status") == "success":
+                return {
+                    "dataset": result["dataset"],
+                    "samples_loaded": result["num_samples"],
+                    "example_questions": result["sample_questions"],
+                    "status": "ready for evaluation"
+                }
+            else:
+                return {
+                    "error": result.get("error", "Unknown error occurred"),
+                    "status": "failed to load dataset"
+                }
+        except Exception as e:
+            return {
+                "error": str(e),
+                "status": "failed to load dataset"
+            }

     def run_evaluation(dataset_choice, splitting_strategy, chunk_size, vector_db, qa_chain):
         if not evaluator.current_dataset:
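The surrounding demo() layout is outside this diff, but a handler like load_dataset_handler is typically bound to a control in a Gradio Blocks app. A minimal wiring sketch, with component names assumed rather than taken from the commit:

import gradio as gr

with gr.Blocks() as ui:
    dataset_choice = gr.Dropdown(choices=["squad", "msmarco"], value="squad", label="Dataset")
    load_btn = gr.Button("Load dataset")
    load_status = gr.JSON(label="Load status")
    # The handler returns a plain dict, which gr.JSON renders directly.
    load_btn.click(fn=load_dataset_handler, inputs=dataset_choice, outputs=load_status)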