Spaces:

Svngoku
/

afrimmlu-iroko-bench-deepseek

Running

App Files Files Community

Svngoku committed 28 days ago

Commit

830bf75

verified ·

1 Parent(s): bdaf5da

Create app.py

Browse files

Files changed (1) hide show

app.py +206 -0

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import ast
+import pandas as pd
+import gradio as gr
+import litellm
+import plotly.express as px
+from collections import defaultdict
+from datetime import datetime
def _parse_choices(raw_choices):
    """
    Parse a stringified list of answer options into a real list.

    Several progressively "cleaned" variants of the text are tried, starting
    with the untouched text, so that a literal which is already valid is never
    damaged by the clean-up steps.

    Args:
        raw_choices: String expected to hold a Python list literal, possibly
            wrapped in an extra pair of quotes and/or containing ``\\'``.

    Returns:
        The parsed options as a list, or None when no variant evaluates to a
        list or tuple.
    """
    text = raw_choices.strip()
    candidates = [text]
    # An extra layer of matching outer quotes hides the list literal inside.
    if len(text) >= 2 and text[0] == text[-1] and text[0] in "'\"":
        candidates.append(text[1:-1])
    # Last resort: un-escape \' sequences. Tried only after the variants above
    # because removing the backslash breaks valid literals such as 'it\'s'.
    candidates.extend([c.replace("\\'", "'") for c in candidates if "\\'" in c])

    for candidate in candidates:
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError):
            continue
        # A quoted string or a number also evaluates successfully; only a
        # sequence of options is usable downstream.
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    return None


def preprocess_dataset(test_data):
    """
    Convert each example's 'choices' field from a string to a list.

    Examples whose 'choices' value is not a string are passed through
    unchanged. Examples whose string cannot be parsed into a list are reported
    on stdout and left out of the result.

    Args:
        test_data: Iterable of dict-like examples, each with a 'choices' key.

    Returns:
        A new list of examples. Examples that needed parsing are shallow
        copies, so the caller's records are not modified.

    Raises:
        KeyError: If an example has no 'choices' key.
    """
    preprocessed_data = []
    for example in test_data:
        choices = example['choices']
        if isinstance(choices, str):
            parsed_choices = _parse_choices(choices)
            if parsed_choices is None:
                print(f"Error parsing choices: {choices}")
                continue
            # Copy before editing so the input data stays as the caller gave it.
            example = dict(example)
            example['choices'] = parsed_choices
        preprocessed_data.append(example)
    return preprocessed_data
_ANSWER_LETTERS = ("A", "B", "C", "D")

# Column order of detailed_results.csv; also used to write a header when no
# question could be scored, so the file can always be read back.
_DETAILED_RESULT_COLUMNS = [
    'timestamp',
    'subject',
    'question',
    'model_answer',
    'correct_answer',
    'is_correct',
    'total_tokens',
]


def _build_prompt(question, choices):
    """Format a question and its first four options into the model prompt."""
    if isinstance(choices, str) or len(choices) < len(_ANSWER_LETTERS):
        raise ValueError(f"Expected four answer choices, got: {choices!r}")
    options = "".join(
        f"{letter}. {choice}\n" for letter, choice in zip(_ANSWER_LETTERS, choices)
    )
    return (
        "Answer the following multiple-choice question. "
        "Return only the letter corresponding to the correct answer (A, B, C, or D).\n"
        f"Question: {question}\n"
        "Options:\n"
        f"{options}"
        "Answer:"
    )


def _extract_answer_letter(model_output):
    """
    Return the first stand-alone A-D letter in the model's reply, or None.

    Only whole tokens count, so the "A" inside "ANSWER" is not mistaken for an
    answer. A reply that begins with the English article "A" is still read as
    answer A; the prompt asks for the letter only, which keeps this rare.
    """
    # Turn punctuation into spaces so "B.", "(c)" and "**D**" become bare tokens.
    normalized = "".join(
        ch if ch.isalnum() else " " for ch in model_output.upper()
    )
    for token in normalized.split():
        if token in _ANSWER_LETTERS:
            return token
    return None


def evaluate_afrimmlu(test_data, model_name="deepseek-chat"):
    """
    Evaluate the model on the AfriMMLU dataset.

    Each example is sent to ``litellm.completion`` as a four-option
    multiple-choice prompt and the reply is compared with the expected letter.
    An example that raises at any step (malformed record, API failure,
    unusable response) is reported on stdout and excluded from every count.

    Side effects: writes 'detailed_results.csv' and 'summary_results.csv' to
    the current working directory.

    Args:
        test_data: Iterable of examples with 'question', 'choices' (at least
            four options), 'answer' (a letter string) and 'subject' keys.
        model_name: Model identifier passed to ``litellm.completion``.

    Returns:
        Dict with 'accuracy' (overall percentage), 'subject_accuracy'
        (percentage per subject) and 'detailed_results' (one dict per scored
        question).
    """
    results = []
    subject_results = defaultdict(lambda: {"correct": 0, "total": 0})

    for example in test_data:
        # Everything that can fail for a single example lives inside the try,
        # so one bad record or one failed request never aborts the whole run.
        try:
            subject = example['subject']
            question = example['question']
            expected = example['answer'].strip().upper()
            prompt = _build_prompt(question, example['choices'])
            response = litellm.completion(
                model=model_name,
                messages=[{"role": "user", "content": prompt}]
            )
            model_answer = _extract_answer_letter(
                response.choices[0].message.content
            )
            is_correct = model_answer == expected
            record = {
                'timestamp': datetime.now().isoformat(),
                'subject': subject,
                'question': question,
                'model_answer': model_answer,
                'correct_answer': expected,
                'is_correct': is_correct,
                'total_tokens': response.usage.total_tokens
            }
        except Exception as e:
            print(f"Error processing question: {str(e)}")
            continue

        # Counters are updated only once the record is complete, keeping the
        # totals and the detailed rows in agreement.
        stats = subject_results[subject]
        stats["total"] += 1
        if is_correct:
            stats["correct"] += 1
        results.append(record)

    # Calculate accuracies
    total = len(results)
    correct = sum(1 for record in results if record['is_correct'])
    accuracy = (correct / total * 100) if total > 0 else 0
    subject_accuracy = {
        subject: (stats["correct"] / stats["total"] * 100) if stats["total"] > 0 else 0
        for subject, stats in subject_results.items()
    }

    # Export results to CSV (explicit columns keep the header even when empty)
    pd.DataFrame(results, columns=_DETAILED_RESULT_COLUMNS).to_csv(
        'detailed_results.csv', index=False
    )

    # Export summary to CSV
    summary_data = [
        {'subject': subject, 'accuracy': acc}
        for subject, acc in subject_accuracy.items()
    ]
    summary_data.append({'subject': 'Overall', 'accuracy': accuracy})
    pd.DataFrame(summary_data).to_csv('summary_results.csv', index=False)

    return {
        "accuracy": accuracy,
        "subject_accuracy": subject_accuracy,
        "detailed_results": results
    }
def create_visualization(results_dict):
    """
    Build the summary table and bar chart for an evaluation run.

    Args:
        results_dict: Dict with a 'subject_accuracy' mapping of subject to
            accuracy and an 'accuracy' entry holding the overall value.

    Returns:
        Tuple ``(summary_df, fig)``: a DataFrame with 'Subject' and
        'Accuracy (%)' columns (one row per subject, then an 'Overall' row)
        and the plotly bar chart drawn from it.
    """
    per_subject = results_dict['subject_accuracy']

    # Column-wise construction; the 'Overall' row always comes last.
    subject_column = list(per_subject.keys()) + ['Overall']
    accuracy_column = list(per_subject.values()) + [results_dict['accuracy']]
    summary_df = pd.DataFrame({
        'Subject': subject_column,
        'Accuracy (%)': accuracy_column,
    })

    bar_options = {
        'x': 'Subject',
        'y': 'Accuracy (%)',
        'title': 'AfriMMLU Evaluation Results',
        'labels': {'Subject': 'Subject', 'Accuracy (%)': 'Accuracy (%)'},
    }
    fig = px.bar(summary_df, **bar_options)

    # Slanted tick labels keep long subject names readable.
    fig.update_layout(xaxis_tickangle=-45, showlegend=False, height=600)
    return summary_df, fig
def evaluate_and_display(test_file, model_name):
    """
    Process uploaded file and run evaluation.

    Args:
        test_file: The uploaded JSON test file as delivered by the Gradio file
            input: either an object exposing the path as ``.name`` or a plain
            path string. None when nothing was uploaded.
        model_name: Model identifier forwarded to ``evaluate_afrimmlu``.

    Returns:
        Tuple ``(summary_df, plot, detailed_df)`` for the summary table, the
        bar chart and the per-question results table.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if test_file is None:
        # Show a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please upload a JSON test file before running the evaluation.")

    # Accept both a wrapper object with .name and a bare path string.
    file_path = getattr(test_file, "name", test_file)
    test_data = pd.read_json(file_path)
    preprocessed_data = preprocess_dataset(test_data.to_dict('records'))
    results = evaluate_afrimmlu(preprocessed_data, model_name)
    summary_df, plot = create_visualization(results)

    # Build the table from this run's results rather than re-reading the
    # fixed-name CSV from disk: that file may be empty (no header to parse) or
    # overwritten by another evaluation running at the same time.
    detailed_df = pd.DataFrame(
        results["detailed_results"],
        columns=[
            'timestamp',
            'subject',
            'question',
            'model_answer',
            'correct_answer',
            'is_correct',
            'total_tokens',
        ],
    )
    return summary_df, plot, detailed_df
def create_gradio_interface():
    """
    Assemble the Gradio Blocks dashboard and wire up the Evaluate button.

    Layout, top to bottom: heading, input panel (file upload, model dropdown,
    Evaluate button), summary table, bar chart, detailed results table.

    Returns:
        The configured ``gr.Blocks`` app, not yet launched.
    """
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # AfriMMLU Evaluation Dashboard
        Upload your test data and select a model to evaluate performance on the AfriMMLU benchmark.
        """)

        # Input panel: dataset, model choice and the trigger button.
        with gr.Row(), gr.Column(scale=1):
            uploaded_file = gr.File(
                label="Upload Test Data (JSON)",
                file_types=[".json"],
            )
            selected_model = gr.Dropdown(
                choices=["deepseek-chat", "gpt-3.5-turbo", "gpt-4"],
                label="Select Model",
                value="deepseek-chat",
            )
            run_button = gr.Button("Evaluate", variant="primary")

        # Per-subject accuracy table.
        with gr.Row(), gr.Column():
            accuracy_table = gr.Dataframe(
                headers=["Subject", "Accuracy (%)"],
                label="Summary Results",
            )

        # Bar chart of the same summary.
        with gr.Row(), gr.Column():
            accuracy_chart = gr.Plot(label="Performance by Subject")

        # One row per evaluated question.
        with gr.Row(), gr.Column():
            question_table = gr.Dataframe(
                label="Detailed Results",
                wrap=True,
            )

        # The output order matches the tuple returned by evaluate_and_display.
        run_button.click(
            fn=evaluate_and_display,
            inputs=[uploaded_file, selected_model],
            outputs=[accuracy_table, accuracy_chart, question_table],
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the dashboard, then start the Gradio server.
    # `demo` stays bound at module level, as in the original script.
    demo = create_gradio_interface()
    # share=True asks Gradio to also create a public share link.
    demo.launch(share=True)