shezamunir committed
updated values and added ranking, new fields
- app.py +133 -27
- tiered_models_data.csv +23 -0
app.py CHANGED
@@ -5,7 +5,7 @@ from PIL import Image
 # Set up page config
 st.set_page_config(
     page_title="FactBench Leaderboard",
-
+    layout="wide", # Layout remains wide, but content will be centered
 )
 
 # Load the image
@@ -81,17 +81,82 @@ st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</di
 st.markdown('</div>', unsafe_allow_html=True)
 
 # Load the data
-data_path = "factbench_data.csv"
+# data_path = "factbench_data.csv"
+data_path = "tiered_models_data.csv"
 df = pd.read_csv(data_path)
 
 # Create tabs
 tab1, tab2, tab3 = st.tabs(
-    ["Leaderboard", "Benchmark Details", "Submit
+    ["Leaderboard", "Benchmark Details", "Submit your models"])
 
 # Tab 1: Leaderboard
+# with tab1:
+#     st.markdown('<div class="title">Leaderboard</div>',
+#                 unsafe_allow_html=True)
+#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+#     # Dropdown menu to filter tiers
+#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
+#     selected_tier = st.selectbox('Select Tier:', tiers)
+
+#     # Filter the data based on the selected tier
+#     if selected_tier != 'All Tiers':
+#         filtered_df = df[df['Tier'] == selected_tier]
+#     else:
+#         filtered_df = df
+
+#     # Create HTML for the table
+#     html = '''
+#     <table>
+#         <thead>
+#             <tr>
+#                 <th>Tier</th>
+#                 <th>Model</th>
+#                 <th>FactScore</th>
+#                 <th>SAFE</th>
+#                 <th>Factcheck-GPT</th>
+#                 <th>VERIFY</th>
+#             </tr>
+#         </thead>
+#         <tbody>
+#     '''
+
+#     # Generate the rows of the table
+#     current_tier = None
+#     for i, row in filtered_df.iterrows():
+#         if row['Tier'] != current_tier:
+#             if current_tier is not None:
+#                 # Close the previous tier row
+#                 html += '    </tr>'
+#             current_tier = row['Tier']
+#             html += f'    <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
+#         else:
+#             html += '    <tr>'
+
+#         # Fill in model and scores
+#         html += f'''
+#             <td>{row['Model']}</td>
+#             <td>{row['FactScore']:.2f}</td>
+#             <td>{row['SAFE']:.2f}</td>
+#             <td>{row['Factcheck-GPT']:.2f}</td>
+#             <td>{row['VERIFY']:.2f}</td>
+#         </tr>
+#         '''
+
+#     # Close the last row and table tags
+#     html += '''
+#         </table>
+#     '''
+
+#     # Display the table
+#     st.markdown(html, unsafe_allow_html=True)
+
+#     st.markdown('</div>', unsafe_allow_html=True)
+df['rank'] = df['factuality_score'].rank(
+    ascending=False, method='min').astype(int)
+
 with tab1:
-    st.markdown('<div class="title">Leaderboard</div>',
-                unsafe_allow_html=True)
+    st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
 
     # Dropdown menu to filter tiers
@@ -100,21 +165,51 @@ with tab1:
 
     # Filter the data based on the selected tier
     if selected_tier != 'All Tiers':
-        filtered_df = df[df['Tier'] == selected_tier]
+        filtered_df = df[df['tier'] == selected_tier]
     else:
         filtered_df = df
+    # Add sorting functionality for Factuality Score
+    # sort_order = st.radio('Sort by Factuality Score:',
+    #                       ('Ascending', 'Descending'))
+
+    # # Sort the dataframe based on Factuality Score
+    # if sort_order == 'Ascending':
+    #     filtered_df = filtered_df.sort_values(
+    #         by='factuality_score', ascending=True)
+    # else:
+    #     filtered_df = filtered_df.sort_values(
+    #         by='factuality_score', ascending=False)
+    # Option to sort by Factuality Score in ascending order
+    sort_by_factuality = st.checkbox('Sort by Factuality Score')
+
+    # Sort the dataframe based on Factuality Score if the checkbox is selected
+    if sort_by_factuality:
+        updated_filtered_df = filtered_df.sort_values(
+            by='factuality_score', ascending=False)
+    else:
+        updated_filtered_df = filtered_df
 
     # Create HTML for the table
     html = '''
     <table>
         <thead>
             <tr>
+                <th>Rank</th>
                 <th>Tier</th>
                 <th>Model</th>
-                <th>FactScore</th>
-                <th>SAFE</th>
-                <th>Factcheck-GPT</th>
-                <th>VERIFY</th>
+                <th>Factuality Score</th>
+                <th>Hallucination Score</th>
+                <th>Avg Tokens</th>
+                <th>Avg Factual Units</th>
+                <th>Avg Undecidable Units</th>
+                <th>Avg Unsupported Units</th>
+                <th>Factual Recall</th>
+                <th>Conceptual Understanding</th>
+                <th>Procedural Execution</th>
+                <th>Comparative Analysis</th>
+                <th>Recommendations and Insights</th>
+                <th>Domain-Specific Knowledge</th>
+                <th>Temporal Context</th>
             </tr>
         </thead>
         <tbody>
@@ -122,27 +217,39 @@ with tab1:
 
     # Generate the rows of the table
     current_tier = None
-    for i, row in filtered_df.iterrows():
-        if row['Tier'] != current_tier:
-            if current_tier is not None:
-                # Close the previous tier row
-                html += '    </tr>'
-            current_tier = row['Tier']
-            html += f'    <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
-        else:
-            html += '    <tr>'
-
+    for i, row in updated_filtered_df.iterrows():
+        # if row['tier'] != current_tier:
+        #     if current_tier is not None:
+        #         html += '    </tr>'
+        #     current_tier = row['tier']
+        #     # 7 models, change this number when more models
+        #     html += f'    <tr><td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
+        # else:
+        #     html += '    <tr>'
+
+        html += '    <tr>'
         # Fill in model and scores
         html += f'''
-            <td>{row['Model']}</td>
-            <td>{row['FactScore']:.2f}</td>
-            <td>{row['SAFE']:.2f}</td>
-            <td>{row['Factcheck-GPT']:.2f}</td>
-            <td>{row['VERIFY']:.2f}</td>
+            <td>{row['rank']}</td>
+            <td>{row['tier']}</td>
+            <td>{row['model']}</td>
+            <td>{row['factuality_score']:.2f}</td>
+            <td>{row['hallucination_score']:.2f}</td>
+            <td>{row['avg_tokens']:.2f}</td>
+            <td>{row['avg_factual_units']:.2f}</td>
+            <td>{row['avg_undecidable_units']:.2f}</td>
+            <td>{row['avg_unsupported_units']:.2f}</td>
+            <td>{row['prompt_categories.Factual Recall']:.2f}</td>
+            <td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
+            <td>{row['prompt_categories.Procedural Execution']:.2f}</td>
+            <td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
+            <td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
+            <td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
+            <td>{row['prompt_categories.Temporal Context']:.2f}</td>
         </tr>
         '''
 
-    # Close the last row and table tags
+    # Close the table
     html += '''
         </table>
     '''
@@ -151,7 +258,6 @@ with tab1:
     st.markdown(html, unsafe_allow_html=True)
 
     st.markdown('</div>', unsafe_allow_html=True)
-
 # Tab 2: Details
 with tab2:
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
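
Note on the ranking added above: df['rank'] is computed once over the full dataframe with rank(ascending=False, method='min'), so the rank is global across all tiers and models, tied factuality scores share the smallest rank in their tied group, and later tier filtering or the checkbox sort does not renumber anything. A minimal sketch of that tie behaviour, using made-up scores rather than values from the leaderboard:

import pandas as pd

# Toy scores, chosen only to illustrate how method='min' handles ties.
toy = pd.DataFrame({
    "model": ["m1", "m2", "m3", "m4"],
    "factuality_score": [92.0, 91.6, 91.6, 75.7],
})

# Same call as in app.py: the highest score gets rank 1, ties share the smallest rank.
toy["rank"] = toy["factuality_score"].rank(ascending=False, method="min").astype(int)

print(toy.to_string(index=False))
# model  factuality_score  rank
#    m1              92.0     1
#    m2              91.6     2
#    m3              91.6     2
#    m4              75.7     4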
tiered_models_data.csv ADDED
@@ -0,0 +1,23 @@
+tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
+Tier 1: Easy,gpt4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
+Tier 1: Easy,gemini,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
+Tier 1: Easy,llama3.1_70B_instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
+Tier 1: Easy,llama3.1_405B_instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
+Tier 1: Easy,claude-3.5-sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
+Tier 1: Easy,commandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
+Tier 1: Easy,mistral-large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
+Tier 2: Moderate,gpt4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
+Tier 2: Moderate,gemini,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
+Tier 2: Moderate,llama3.1_70B_instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
+Tier 2: Moderate,llama3.1_405B_instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
+Tier 2: Moderate,claude-3.5-sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
+Tier 2: Moderate,commandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
+Tier 2: Moderate,mistral-large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
+Tier 3: Hard,gpt4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
+Tier 3: Hard,gemini,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
+Tier 3: Hard,llama3.1_70B_instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
+Tier 3: Hard,llama3.1_405B_instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
+Tier 3: Hard,claude-3.5-sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
+Tier 3: Hard,commandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
+Tier 3: Hard,mistral-large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0
+
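
One quirk of the new CSV worth knowing when loading it directly: the Tier 3: Hard rows for gpt4-o, gemini and llama3.1_70B_instruct are one field short of the 15-column header, so pandas reads their prompt_categories.Temporal Context value as NaN, and the :.2f formatting in app.py will render those cells as "nan". A small sketch of one way to guard against that; the fmt helper and the "N/A" placeholder are illustrative choices, not part of the commit:

import pandas as pd

df = pd.read_csv("tiered_models_data.csv")

# Rows whose Temporal Context field is absent in the CSV come back as NaN.
missing = df[df["prompt_categories.Temporal Context"].isna()]
print(missing[["tier", "model"]].to_string(index=False))

# Hypothetical helper: format a score, falling back to a placeholder when the value is missing.
def fmt(value):
    return "N/A" if pd.isna(value) else f"{value:.2f}"

print(df["prompt_categories.Temporal Context"].map(fmt).tolist())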