Update app.py
app.py
CHANGED
@@ -4,9 +4,6 @@ import os
 import re
 from datetime import datetime

-# Leaderboard Data (example CSV file for leaderboard)
-LEADERBOARD_FILE = "leaderboard.csv"
-
 def clean_answer(answer):
     if pd.isna(answer):
         return None
@@ -18,21 +15,38 @@ def clean_answer(answer):
         return first_letter
     return None

-def display_leaderboard():
+def write_evaluation_results(results, output_file):
+    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    output_text = [
+        f"Evaluation Results for Model: {results['model_name']}",
+        f"Timestamp: {timestamp}",
+        "-" * 50,
+        f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
+        f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
+        f"Total Questions: {results['total_questions']}",
+        f"Valid Predictions: {results['valid_predictions']}",
+        f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
+        f"Correct Predictions: {results['correct_predictions']}",
+        "\nPerformance by Field:",
+        "-" * 50
+    ]
+
+    for field, metrics in results['field_performance'].items():
+        field_results = [
+            f"\nField: {field}",
+            f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
+            f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
+            f"Correct: {metrics['correct']}/{metrics['total']}",
+            f"Invalid predictions: {metrics['invalid']}"
+        ]
+        output_text.extend(field_results)
+
+    with open(output_file, 'w') as f:
+        f.write('\n'.join(output_text))
+    print('\n'.join(output_text))
+    print(f"\nResults have been saved to: {output_file}")

 def evaluate_predictions(prediction_file):
     ground_truth_file = "ground_truth.csv" # Specify the path to the ground truth file
@@ -70,7 +84,6 @@ def evaluate_predictions(prediction_file):
     total_predictions = len(merged_df)
     total_valid_predictions = len(valid_predictions)

-    # Ensure no division by zero
     overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
     valid_accuracy = (
         correct_predictions / total_valid_predictions
@@ -114,30 +127,21 @@ def evaluate_predictions(prediction_file):
     except Exception as e:
         return f"Error during evaluation: {str(e)}", None

-        eval_results_file = gr.File(label="Download Evaluation Results")
-        eval_button = gr.Button("Evaluate")
-        eval_button.click(
-            evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
-        )
-    with gr.Tab("Leaderboard"):
-        leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
-        refresh_button = gr.Button("Refresh Leaderboard")
-        refresh_button.click(display_leaderboard, outputs=leaderboard_text)
+# Gradio Interface
+description = "Upload a prediction CSV file to evaluate predictions against the ground truth stored in the system."
+
+demo = gr.Interface(
+    fn=evaluate_predictions,
+    inputs=[
+        gr.File(label="Upload Prediction CSV")
+    ],
+    outputs=[
+        gr.Textbox(label="Evaluation Status"),
+        gr.File(label="Download Evaluation Results")
+    ],
+    title="Prediction Evaluation Tool",
+    description=description
+)

 if __name__ == "__main__":
     demo.launch()
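For reference, the new write_evaluation_results helper only expects a plain dictionary carrying the keys it formats: model_name, overall_accuracy, valid_accuracy, total_questions, valid_predictions, invalid_predictions, correct_predictions, and a per-field field_performance mapping whose entries provide accuracy, valid_accuracy, correct, total, and invalid. A minimal sketch of calling it outside the Gradio app is shown below; the import path and all numbers are illustrative assumptions, not output of the real evaluator.

# Minimal sketch: call the report writer directly with hand-made numbers.
# Assumes this script sits next to app.py; every value below is illustrative.
from app import write_evaluation_results

results = {
    "model_name": "example-model",
    "overall_accuracy": 0.72,   # correct / all rows, including invalid ones
    "valid_accuracy": 0.80,     # correct / valid rows only
    "total_questions": 100,
    "valid_predictions": 90,
    "invalid_predictions": 10,
    "correct_predictions": 72,
    "field_performance": {
        "Biology": {"accuracy": 0.70, "valid_accuracy": 0.78, "correct": 35, "total": 50, "invalid": 5},
        "Physics": {"accuracy": 0.74, "valid_accuracy": 0.82, "correct": 37, "total": 50, "invalid": 5},
    },
}

write_evaluation_results(results, "evaluation_results.txt")

The helper writes the formatted report to the given path and prints the same text to stdout.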
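Replacing the old Blocks layout (and its leaderboard tab) with a single gr.Interface also pins down the return contract of evaluate_predictions: with a gr.Textbox and a gr.File declared as outputs, the function should return a (status_text, results_file_path) pair, which matches the error path kept in the diff (return f"Error during evaluation: ...", None). The sketch below wires the same interface to a stub so the UI can be exercised on its own; evaluate_predictions_stub and its body are assumptions for illustration, not the real evaluation logic.

import gradio as gr

def evaluate_predictions_stub(prediction_file):
    # Stand-in for evaluate_predictions: return a status string plus a path to a
    # downloadable results file (or None), matching the two outputs declared below.
    if prediction_file is None:
        return "No file uploaded.", None
    output_path = "evaluation_results.txt"
    with open(output_path, "w") as f:
        f.write("Placeholder evaluation report\n")
    return "Evaluation completed.", output_path

demo = gr.Interface(
    fn=evaluate_predictions_stub,
    inputs=[gr.File(label="Upload Prediction CSV")],
    outputs=[gr.Textbox(label="Evaluation Status"), gr.File(label="Download Evaluation Results")],
    title="Prediction Evaluation Tool",
)

if __name__ == "__main__":
    demo.launch()  # running this file directly starts a local Gradio server, as app.py does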