# MTBenchExplorer / app.py
# RVikas's picture
# Update app.py
# 8f4e737 verified
import streamlit as st
import json
import pandas as pd
import plotly.express as px
# Category names, in order. The order matters: get_model_df() picks a
# category by position from the question id, and the polar chart uses this
# list as its axis ordering.
CATEGORIES = [
    "Writing",
    "Roleplay",
    "Reasoning",
    "Math",
    "Coding",
    "Extraction",
    "STEM",
    "Humanities",
]
# Load and process the single model data
@st.cache_data  # data-returning loader: cache_data hands each rerun its own copy
def get_model_df(path: str = "data/gpt-4_single.jsonl") -> pd.DataFrame:
    """Load judgment records from a JSONL file into a DataFrame.

    Every non-blank line is parsed as one JSON object. Each object gets an
    extra ``"category"`` key derived from its ``"question_id"``: ids are
    grouped in consecutive blocks of ten starting at 81, and the block number
    selects the entry of ``CATEGORIES``.

    Args:
        path: Location of the JSONL file. Defaults to the file the app
            originally read.

    Returns:
        A DataFrame with one row per parsed record.

    Raises:
        ValueError: If a record's ``question_id`` does not map to an entry of
            ``CATEGORIES``.
    """
    records = []
    with open(path, "r", encoding="utf-8") as fin:
        for line_no, line in enumerate(fin, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)
            cat_idx = (obj["question_id"] - 81) // 10
            # A negative index would silently wrap around to the wrong
            # category, so reject anything outside the list explicitly.
            if not 0 <= cat_idx < len(CATEGORIES):
                raise ValueError(
                    f"{path}:{line_no}: question_id {obj['question_id']!r} "
                    "is outside the range covered by CATEGORIES"
                )
            obj["category"] = CATEGORIES[cat_idx]
            records.append(obj)
    return pd.DataFrame(records)
# Placeholder for the pair model data function
# Adapt this function based on how your "gpt-4_pair.jsonl" is structured
@st.cache_data  # data-returning loader, so cache it as data rather than as a shared resource
def get_model_df_pair() -> pd.DataFrame:
    """Return the pair-comparison data as a DataFrame.

    Not implemented yet: this currently always returns an empty DataFrame.

    Returns:
        An empty ``pandas.DataFrame``.
    """
    # Implement similar to get_model_df if you have pair data
    return pd.DataFrame([])  # Placeholder
# Load the (cached) data tables. df_pair is loaded but not used below.
df = get_model_df()
df_pair = get_model_df_pair()

# Streamlit app starts here
st.title('Model Performance Visualization')

# Select models to display. The options are converted to a plain list so the
# widget receives ordinary Python values; the first three are pre-selected.
all_models = df["model"].unique().tolist()
selected_models = st.multiselect('Select Models', all_models, default=all_models[:3])

# Main app logic
if selected_models:
    # Renaming models for better visualization: maps a model name as it
    # appears in the data to the label shown in the chart legend.
    rename_map = {
        # Define your renaming map here, if needed
    }

    # Rows with a negative score are excluded from the averages. The filter
    # does not depend on the model or category, so apply it once up front.
    valid = df[df["score"] >= 0]

    scores_all = []
    for model in selected_models:
        model_rows = valid[valid["model"] == model]
        # Rename only the model label, so category names and scores are
        # never rewritten by accident.
        label = rename_map.get(model, model)
        for cat in CATEGORIES:
            in_cat = model_rows["category"] == cat
            # The mean of an empty selection is NaN; the row is kept so every
            # model has an entry for every category.
            score = model_rows.loc[in_cat, "score"].mean()
            scores_all.append({"model": label, "category": cat, "score": score})
    df_score = pd.DataFrame(scores_all)

    # Generate the radial graph: one closed line per model, one axis per
    # category, in CATEGORIES order.
    fig = px.line_polar(
        df_score,
        r='score',
        theta='category',
        line_close=True,
        category_orders={"category": CATEGORIES},
        color='model',
        markers=True,
    )

    # Display the Plotly figure in Streamlit
    st.plotly_chart(fig)
else:
    # Without this hint an empty selection leaves the page blank below the widget.
    st.info('Select at least one model to display the chart.')