Spaces:
Running
Running
Florian Leuerer
committed on
Commit
•
2ce0e9d
1
Parent(s):
8a2da5a
added benchmarks
Browse files
- app.py +38 -17
- model_stats.csv +49 -0
app.py
CHANGED
@@ -3,14 +3,17 @@ import pandas as pd
|
|
3 |
import random
|
4 |
|
5 |
df = pd.read_csv('data.csv')
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
-
models = df.columns.tolist()
|
9 |
-
print(models)
|
10 |
models.remove('hash')
|
11 |
models.remove('message')
|
12 |
messages = sorted(df['message'].tolist(), key=len)
|
13 |
-
messages_select = [(m[:
|
|
|
14 |
|
15 |
def out(message, model1, model2):
|
16 |
row = df[df['message'] == message]
|
@@ -18,24 +21,42 @@ def out(message, model1, model2):
|
|
18 |
output2 = row[model2].values[0]
|
19 |
return message, output1, output2
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
with gr.Blocks() as iface:
|
23 |
-
gr.
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
btn.click(out,
|
38 |
inputs=[drop_message, drop_model1, drop_model2],
|
39 |
outputs=[out_message, out_model1, out_model2])
|
40 |
|
41 |
-
iface.
|
|
|
3 |
import random
|
4 |
|
5 |
df = pd.read_csv('data.csv')
|
6 |
+
df_stats = pd.read_csv('data_stats_langs.csv')
|
7 |
+
map_models = df_stats[['model','model_name']].set_index('model').to_dict()
|
8 |
+
df = df.rename(columns=map_models['model_name'])
|
9 |
|
10 |
|
11 |
+
models = sorted(df.columns.tolist())
|
|
|
12 |
models.remove('hash')
|
13 |
models.remove('message')
|
14 |
messages = sorted(df['message'].tolist(), key=len)
|
15 |
+
messages_select = [(m[:250],m) for m in messages]
|
16 |
+
|
17 |
|
18 |
def out(message, model1, model2):
|
19 |
row = df[df['message'] == message]
|
|
|
21 |
output2 = row[model2].values[0]
|
22 |
return message, output1, output2
|
23 |
|
24 |
+
OUTPUT_DESCRIPTION='''How good are OpenSource LLMs in German? I've benchmarked a couple of models and generated outputs for about 250 prompts to compare the models.
|
25 |
+
|
26 |
+
For information about the used dataset and generation see the [README.md](https://huggingface.co/spaces/floleuerer/german_llm_outputs/blob/main/README.md)
|
27 |
+
|
28 |
+
Select a Prompt and the models you would like to compare -> hit "Show Outputs"
|
29 |
+
'''
|
30 |
+
|
31 |
+
BENCHMARK_DESCRIPTION='''# Columns
|
32 |
+
de: German Benchmark results (arc, hellaswag, mmlu)
|
33 |
+
en: English Benchmark results (arc, hellaswag, mmlu)
|
34 |
+
de_frac: Given a german prompt - how often does the model correctly respond in German?
|
35 |
+
'''
|
36 |
|
37 |
with gr.Blocks() as iface:
|
38 |
+
with gr.Tab('Model Outputs'):
|
39 |
+
gr.Markdown(OUTPUT_DESCRIPTION)
|
40 |
+
with gr.Row():
|
41 |
+
drop_message = gr.Dropdown(messages_select, label='Prompt', value=random.choice(messages))
|
42 |
+
with gr.Row():
|
43 |
+
drop_model1 = gr.Dropdown(models, label='Model 1', value=random.choice(models))
|
44 |
+
drop_model2 = gr.Dropdown(models, label='Model 2', value=random.choice(models))
|
45 |
+
with gr.Row():
|
46 |
+
btn = gr.Button("Show Outputs")
|
47 |
+
with gr.Row():
|
48 |
+
out_message = gr.TextArea(label='Prompt')
|
49 |
+
with gr.Row():
|
50 |
+
out_model1 = gr.TextArea(label='Output Model 1')
|
51 |
+
out_model2 = gr.TextArea(label='Output Model 2')
|
52 |
+
with gr.Tab('Benchmarks'):
|
53 |
+
gr.Markdown(BENCHMARK_DESCRIPTION)
|
54 |
+
gr.Dataframe(df_stats.drop('model', axis=1))
|
55 |
+
|
56 |
+
|
57 |
|
58 |
btn.click(out,
|
59 |
inputs=[drop_message, drop_model1, drop_model2],
|
60 |
outputs=[out_message, out_model1, out_model2])
|
61 |
|
62 |
+
iface.launch()
|
model_stats.csv
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model_name,de,en,model
|
2 |
+
mattshumer/mistral-8x7b-chat (Mistral MoE),0.5600046414510724,0.6436643654583162,mattshumer_mistral-8x7b-chat
|
3 |
+
DiscoResearch/mixtral-7b-8expert (Mistral MoE),0.530225066820544,0.6235667963921786,DiscoResearch_mixtral-7b-8expert
|
4 |
+
VAGOsolutions/SauerkrautLM-SOLAR-Instruct (Mistral),0.5264626841230896,0.6448543385417635,VAGOsolutions_SauerkrautLM-SOLAR-Instruct
|
5 |
+
upstage/SOLAR-10.7B-Instruct-v1.0 (Mistral),0.5185997487442598,0.6412418664969965,upstage_SOLAR-10.7B-Instruct-v1.0
|
6 |
+
openaccess-ai-collective/DPOpenHermes-7B (Mistral),0.48089969418378625,0.6204995440653475,openaccess-ai-collective_DPOpenHermes-7B
|
7 |
+
VAGOsolutions/SauerkrautLM-7b-HerO (Mistral),0.4793394346045998,0.5973278911555183,VAGOsolutions_SauerkrautLM-7b-HerO
|
8 |
+
malteos/hermeo-7b (Mistral),0.47863155933837304,0.5786517129900094,malteos_hermeo-7b
|
9 |
+
fblgit/una-cybertron-7b-v2-bf16 (Mistral),0.4714702998712998,0.6280572487472983,fblgit_una-cybertron-7b-v2-bf16
|
10 |
+
berkeley-nest/Starling-LM-7B-alpha (Mistral),0.4682092030538821,0.6082623478431886,berkeley-nest_Starling-LM-7B-alpha
|
11 |
+
openchat/openchat_3.5 (Mistral),0.4677697134012669,0.6093495408502738,openchat_openchat_3.5
|
12 |
+
mistralai/Mistral-7B-Instruct-v0.2 (Mistral),0.4671287660155921,0.5974778867312014,mistralai_Mistral-7B-Instruct-v0.2
|
13 |
+
teknium/OpenHermes-2.5-Mistral-7B (Mistral),0.4648073216754412,0.6009262837808728,teknium_OpenHermes-2.5-Mistral-7B
|
14 |
+
HuggingFaceH4/zephyr-7b-beta (Mistral),0.46414279659716645,0.5990940441734774,HuggingFaceH4_zephyr-7b-beta
|
15 |
+
VAGOsolutions/SauerkrautLM-7b-v1-mistral (Mistral),0.45828853906316275,0.5730988906371487,VAGOsolutions_SauerkrautLM-7b-v1-mistral
|
16 |
+
ehartford/dolphin-2.2.1-mistral-7b (Mistral),0.4575957336188839,0.5889248773933949,ehartford_dolphin-2.2.1-mistral-7b
|
17 |
+
argilla/notus-7b-v1 (Mistral),0.4538267074114633,0.6071679590684926,argilla_notus-7b-v1
|
18 |
+
TheBloke/Llama-2-70B-Chat-GPTQ (Llama2),0.45323342583643317,0.5790860529254064,TheBloke_Llama-2-70B-Chat-GPTQ
|
19 |
+
jphme/em_german_leo_mistral (Mistral),0.45282233724600435,0.4929055625300098,jphme_em_german_leo_mistral
|
20 |
+
Intel/neural-chat-7b-v3-1 (Mistral),0.44742151731716423,0.6027940574092564,Intel_neural-chat-7b-v3-1
|
21 |
+
LeoLM/leo-mistral-hessianai-7b-chat (Mistral),0.4411736805781934,0.4785774283431438,LeoLM_leo-mistral-hessianai-7b-chat
|
22 |
+
mistralai/Mistral-7B-v0.1 (Mistral),0.4386540667014251,0.5698303097528304,mistralai_Mistral-7B-v0.1
|
23 |
+
lmsys/vicuna-13b-v1.5 (Llama2),0.43458993199534945,0.5394977448615276,lmsys_vicuna-13b-v1.5
|
24 |
+
microsoft/Orca-2-13b (Llama2),0.4304508877524478,0.5665260610086892,microsoft_Orca-2-13b
|
25 |
+
LeoLM/leo-hessianai-7b-chat (Llama2),0.4198959530774699,0.46992795158940703,LeoLM_leo-hessianai-7b-chat
|
26 |
+
kaist-ai/prometheus-13b-v1.0 (Llama2),0.4137699872914333,0.5276183571587882,kaist-ai_prometheus-13b-v1.0
|
27 |
+
meta-llama/Llama-2-13b-chat-hf (Llama2),0.4133328486084298,0.5331779969903168,meta-llama_Llama-2-13b-chat-hf
|
28 |
+
Deci/DeciLM-7B-instruct (Deci),0.4087346114484392,0.5746784325286699,Deci_DeciLM-7B-instruct
|
29 |
+
LeoLM/leo-hessianai-7b-chat-bilingual (Llama2),0.40584143945101053,0.4701378898025081,LeoLM_leo-hessianai-7b-chat-bilingual
|
30 |
+
mistralai/Mistral-7B-Instruct-v0.1 (Mistral),0.40127209869969643,0.5329324676998503,mistralai_Mistral-7B-Instruct-v0.1
|
31 |
+
allenai/tulu-2-dpo-7b (Llama2),0.3963844596307675,0.5442215865769193,allenai_tulu-2-dpo-7b
|
32 |
+
Deci/DeciLM-7B (Deci),0.3891524333700293,0.5632148707253728,Deci_DeciLM-7B
|
33 |
+
microsoft/Orca-2-7b (Llama2),0.387631543457434,0.5275512116539839,microsoft_Orca-2-7b
|
34 |
+
Qwen/Qwen-7B (Qwen),0.3854029046626822,0.5250934481726963,Qwen_Qwen-7B
|
35 |
+
deepseek-ai/deepseek-llm-7b-chat (Deepseek),0.3805290423206953,0.5228634605739054,deepseek-ai_deepseek-llm-7b-chat
|
36 |
+
Qwen/Qwen-7B-Chat (Qwen),0.37947320299489906,0.501690371644847,Qwen_Qwen-7B-Chat
|
37 |
+
lmsys/vicuna-7b-v1.5 (Llama2),0.3743686900229406,0.4930702792560562,lmsys_vicuna-7b-v1.5
|
38 |
+
LeoLM/leo-hessianai-7b (Llama2),0.37147688304608356,0.4350647594408416,LeoLM_leo-hessianai-7b
|
39 |
+
meta-llama/Llama-2-7b-chat-hf (Llama2),0.35079143396246293,0.4932635613314786,meta-llama_Llama-2-7b-chat-hf
|
40 |
+
deepseek-ai/deepseek-llm-7b-base (Deepseek),0.3427809248139573,0.4823404844220252,deepseek-ai_deepseek-llm-7b-base
|
41 |
+
meta-llama/Llama-2-7b-hf (Llama2),0.34263087849908386,0.4704191232019917,meta-llama_Llama-2-7b-hf
|
42 |
+
01-ai/Yi-34B-Chat-8bits (Yi),0.3310580204778157,0.5418088737201365,01-ai_Yi-34B-Chat-8bits
|
43 |
+
mosaicml/mpt-7b-chat (Mosaic),0.3027332136150936,0.45647416228902604,mosaicml_mpt-7b-chat
|
44 |
+
mosaicml/mpt-7b-instruct (Mosaic),0.30104594868674767,0.4486541776873733,mosaicml_mpt-7b-instruct
|
45 |
+
mosaicml/mpt-7b (Mosaic),0.2907097637642126,0.42439863757918533,mosaicml_mpt-7b
|
46 |
+
microsoft/phi-2 (Phi),0.26701930278508595,0.5443139382853103,microsoft_phi-2
|
47 |
+
tiiuae/falcon-7b-instruct (Falcon),0.2605640889237062,0.38614111076652985,tiiuae_falcon-7b-instruct
|
48 |
+
microsoft/phi-1_5 (Phi),0.24409127820085472,0.44763163375883214,microsoft_phi-1_5
|
49 |
+
microsoft/phi-1 (Phi),0.2371333593192149,0.2460143542470721,microsoft_phi-1
|