Florian Leuerer committed on
Commit 2ce0e9d
1 Parent(s): 8a2da5a

added benchmarks

Files changed (2)
  1. app.py +38 -17
  2. model_stats.csv +49 -0
app.py CHANGED
@@ -3,14 +3,17 @@ import pandas as pd
import random

df = pd.read_csv('data.csv')
+ df_stats = pd.read_csv('data_stats_langs.csv')
+ map_models = df_stats[['model','model_name']].set_index('model').to_dict()
+ df = df.rename(columns=map_models['model_name'])


- models = df.columns.tolist()
- print(models)
+ models = sorted(df.columns.tolist())
models.remove('hash')
models.remove('message')
messages = sorted(df['message'].tolist(), key=len)
- messages_select = [(m[:150],m) for m in messages]
+ messages_select = [(m[:250],m) for m in messages]
+

def out(message, model1, model2):
    row = df[df['message'] == message]
@@ -18,24 +21,42 @@ def out(message, model1, model2):
    output2 = row[model2].values[0]
    return message, output1, output2

+ OUTPUT_DESCRIPTION='''How good are OpenSource LLMs in German? I've benchmarked a couple of models and generated outputs for about 250 prompts to compare the models.
+
+ For information about the used dataset and generation see the [README.md](https://huggingface.co/spaces/floleuerer/german_llm_outputs/blob/main/README.md)
+
+ Select a Prompt and the models you would like to compare -> hit "Show Outputs"
+ '''
+
+ BENCHMARK_DESCRIPTION='''# Columns
+ de: German Benchmark results (arc, hellaswag, mmlu)
+ en: English Benchmark results (arc, hellaswag, mmlu)
+ de_frac: Given a german prompt - how often does the model correctly respond in German?
+ '''

with gr.Blocks() as iface:
-     gr.Markdown("For information about the used dataset and generation see the [README.md](https://huggingface.co/spaces/floleuerer/german_llm_outputs/blob/main/README.md)")
-     with gr.Row():
-         drop_message = gr.Dropdown(messages_select, label='Prompt', value=random.choice(messages))
-     with gr.Row():
-         drop_model1 = gr.Dropdown(models, label='Model 1', value=random.choice(models))
-         drop_model2 = gr.Dropdown(models, label='Model 2', value=random.choice(models))
-     with gr.Row():
-         btn = gr.Button("Show Outputs")
-     with gr.Row():
-         out_message = gr.TextArea(label='Prompt')
-     with gr.Row():
-         out_model1 = gr.TextArea(label='Output Model 1')
-         out_model2 = gr.TextArea(label='Output Model 2')
+     with gr.Tab('Model Outputs'):
+         gr.Markdown(OUTPUT_DESCRIPTION)
+         with gr.Row():
+             drop_message = gr.Dropdown(messages_select, label='Prompt', value=random.choice(messages))
+         with gr.Row():
+             drop_model1 = gr.Dropdown(models, label='Model 1', value=random.choice(models))
+             drop_model2 = gr.Dropdown(models, label='Model 2', value=random.choice(models))
+         with gr.Row():
+             btn = gr.Button("Show Outputs")
+         with gr.Row():
+             out_message = gr.TextArea(label='Prompt')
+         with gr.Row():
+             out_model1 = gr.TextArea(label='Output Model 1')
+             out_model2 = gr.TextArea(label='Output Model 2')
+     with gr.Tab('Benchmarks'):
+         gr.Markdown(BENCHMARK_DESCRIPTION)
+         gr.Dataframe(df_stats.drop('model', axis=1))
+
+

    btn.click(out,
              inputs=[drop_message, drop_model1, drop_model2],
              outputs=[out_message, out_model1, out_model2])

- iface.launch()
+ iface.launc()
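The key change above is the column-rename step: the stats CSV maps each raw output column in data.csv (e.g. openchat_openchat_3.5) to the display name shown in the dropdowns (e.g. openchat/openchat_3.5 (Mistral)). A minimal, self-contained sketch of that mapping, using two hypothetical rows as stand-ins for the real CSV files:

```python
import pandas as pd

# Hypothetical stand-in for data_stats_langs.csv: 'model' is the raw column name
# used in data.csv, 'model_name' is the label shown in the UI.
df_stats = pd.DataFrame({
    'model': ['openchat_openchat_3.5', 'mistralai_Mistral-7B-Instruct-v0.2'],
    'model_name': ['openchat/openchat_3.5 (Mistral)',
                   'mistralai/Mistral-7B-Instruct-v0.2 (Mistral)'],
})

# Hypothetical stand-in for data.csv: one generated-output column per raw model name.
df = pd.DataFrame({
    'hash': ['abc123'],
    'message': ['Schreibe ein kurzes Gedicht.'],
    'openchat_openchat_3.5': ['...'],
    'mistralai_Mistral-7B-Instruct-v0.2': ['...'],
})

# Same pattern as the commit: build a {raw_name: display_name} dict and rename columns.
map_models = df_stats[['model', 'model_name']].set_index('model').to_dict()
df = df.rename(columns=map_models['model_name'])

models = sorted(df.columns.tolist())
models.remove('hash')
models.remove('message')
print(models)  # display names, ready to use as gr.Dropdown choices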
model_stats.csv ADDED
@@ -0,0 +1,49 @@
+ model_name,de,en,model
+ mattshumer/mistral-8x7b-chat (Mistral MoE),0.5600046414510724,0.6436643654583162,mattshumer_mistral-8x7b-chat
+ DiscoResearch/mixtral-7b-8expert (Mistral MoE),0.530225066820544,0.6235667963921786,DiscoResearch_mixtral-7b-8expert
+ VAGOsolutions/SauerkrautLM-SOLAR-Instruct (Mistral),0.5264626841230896,0.6448543385417635,VAGOsolutions_SauerkrautLM-SOLAR-Instruct
+ upstage/SOLAR-10.7B-Instruct-v1.0 (Mistral),0.5185997487442598,0.6412418664969965,upstage_SOLAR-10.7B-Instruct-v1.0
+ openaccess-ai-collective/DPOpenHermes-7B (Mistral),0.48089969418378625,0.6204995440653475,openaccess-ai-collective_DPOpenHermes-7B
+ VAGOsolutions/SauerkrautLM-7b-HerO (Mistral),0.4793394346045998,0.5973278911555183,VAGOsolutions_SauerkrautLM-7b-HerO
+ malteos/hermeo-7b (Mistral),0.47863155933837304,0.5786517129900094,malteos_hermeo-7b
+ fblgit/una-cybertron-7b-v2-bf16 (Mistral),0.4714702998712998,0.6280572487472983,fblgit_una-cybertron-7b-v2-bf16
+ berkeley-nest/Starling-LM-7B-alpha (Mistral),0.4682092030538821,0.6082623478431886,berkeley-nest_Starling-LM-7B-alpha
+ openchat/openchat_3.5 (Mistral),0.4677697134012669,0.6093495408502738,openchat_openchat_3.5
+ mistralai/Mistral-7B-Instruct-v0.2 (Mistral),0.4671287660155921,0.5974778867312014,mistralai_Mistral-7B-Instruct-v0.2
+ teknium/OpenHermes-2.5-Mistral-7B (Mistral),0.4648073216754412,0.6009262837808728,teknium_OpenHermes-2.5-Mistral-7B
+ HuggingFaceH4/zephyr-7b-beta (Mistral),0.46414279659716645,0.5990940441734774,HuggingFaceH4_zephyr-7b-beta
+ VAGOsolutions/SauerkrautLM-7b-v1-mistral (Mistral),0.45828853906316275,0.5730988906371487,VAGOsolutions_SauerkrautLM-7b-v1-mistral
+ ehartford/dolphin-2.2.1-mistral-7b (Mistral),0.4575957336188839,0.5889248773933949,ehartford_dolphin-2.2.1-mistral-7b
+ argilla/notus-7b-v1 (Mistral),0.4538267074114633,0.6071679590684926,argilla_notus-7b-v1
+ TheBloke/Llama-2-70B-Chat-GPTQ (Llama2),0.45323342583643317,0.5790860529254064,TheBloke_Llama-2-70B-Chat-GPTQ
+ jphme/em_german_leo_mistral (Mistral),0.45282233724600435,0.4929055625300098,jphme_em_german_leo_mistral
+ Intel/neural-chat-7b-v3-1 (Mistral),0.44742151731716423,0.6027940574092564,Intel_neural-chat-7b-v3-1
+ LeoLM/leo-mistral-hessianai-7b-chat (Mistral),0.4411736805781934,0.4785774283431438,LeoLM_leo-mistral-hessianai-7b-chat
+ mistralai/Mistral-7B-v0.1 (Mistral),0.4386540667014251,0.5698303097528304,mistralai_Mistral-7B-v0.1
+ lmsys/vicuna-13b-v1.5 (Llama2),0.43458993199534945,0.5394977448615276,lmsys_vicuna-13b-v1.5
+ microsoft/Orca-2-13b (Llama2),0.4304508877524478,0.5665260610086892,microsoft_Orca-2-13b
+ LeoLM/leo-hessianai-7b-chat (Llama2),0.4198959530774699,0.46992795158940703,LeoLM_leo-hessianai-7b-chat
+ kaist-ai/prometheus-13b-v1.0 (Llama2),0.4137699872914333,0.5276183571587882,kaist-ai_prometheus-13b-v1.0
+ meta-llama/Llama-2-13b-chat-hf (Llama2),0.4133328486084298,0.5331779969903168,meta-llama_Llama-2-13b-chat-hf
+ Deci/DeciLM-7B-instruct (Deci),0.4087346114484392,0.5746784325286699,Deci_DeciLM-7B-instruct
+ LeoLM/leo-hessianai-7b-chat-bilingual (Llama2),0.40584143945101053,0.4701378898025081,LeoLM_leo-hessianai-7b-chat-bilingual
+ mistralai/Mistral-7B-Instruct-v0.1 (Mistral),0.40127209869969643,0.5329324676998503,mistralai_Mistral-7B-Instruct-v0.1
+ allenai/tulu-2-dpo-7b (Llama2),0.3963844596307675,0.5442215865769193,allenai_tulu-2-dpo-7b
+ Deci/DeciLM-7B (Deci),0.3891524333700293,0.5632148707253728,Deci_DeciLM-7B
+ microsoft/Orca-2-7b (Llama2),0.387631543457434,0.5275512116539839,microsoft_Orca-2-7b
+ Qwen/Qwen-7B (Qwen),0.3854029046626822,0.5250934481726963,Qwen_Qwen-7B
+ deepseek-ai/deepseek-llm-7b-chat (Deepseek),0.3805290423206953,0.5228634605739054,deepseek-ai_deepseek-llm-7b-chat
+ Qwen/Qwen-7B-Chat (Qwen),0.37947320299489906,0.501690371644847,Qwen_Qwen-7B-Chat
+ lmsys/vicuna-7b-v1.5 (Llama2),0.3743686900229406,0.4930702792560562,lmsys_vicuna-7b-v1.5
+ LeoLM/leo-hessianai-7b (Llama2),0.37147688304608356,0.4350647594408416,LeoLM_leo-hessianai-7b
+ meta-llama/Llama-2-7b-chat-hf (Llama2),0.35079143396246293,0.4932635613314786,meta-llama_Llama-2-7b-chat-hf
+ deepseek-ai/deepseek-llm-7b-base (Deepseek),0.3427809248139573,0.4823404844220252,deepseek-ai_deepseek-llm-7b-base
+ meta-llama/Llama-2-7b-hf (Llama2),0.34263087849908386,0.4704191232019917,meta-llama_Llama-2-7b-hf
+ 01-ai/Yi-34B-Chat-8bits (Yi),0.3310580204778157,0.5418088737201365,01-ai_Yi-34B-Chat-8bits
+ mosaicml/mpt-7b-chat (Mosaic),0.3027332136150936,0.45647416228902604,mosaicml_mpt-7b-chat
+ mosaicml/mpt-7b-instruct (Mosaic),0.30104594868674767,0.4486541776873733,mosaicml_mpt-7b-instruct
+ mosaicml/mpt-7b (Mosaic),0.2907097637642126,0.42439863757918533,mosaicml_mpt-7b
+ microsoft/phi-2 (Phi),0.26701930278508595,0.5443139382853103,microsoft_phi-2
+ tiiuae/falcon-7b-instruct (Falcon),0.2605640889237062,0.38614111076652985,tiiuae_falcon-7b-instruct
+ microsoft/phi-1_5 (Phi),0.24409127820085472,0.44763163375883214,microsoft_phi-1_5
+ microsoft/phi-1 (Phi),0.2371333593192149,0.2460143542470721,microsoft_phi-1
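The Benchmarks tab renders this table via gr.Dataframe(df_stats.drop('model', axis=1)). A small sketch of inspecting the added file on its own, assuming it is read as model_stats.csv from the repo root (the app itself loads data_stats_langs.csv); sorting by the de column reproduces the ordering above:

```python
import pandas as pd

# Load the benchmark table added in this commit.
stats = pd.read_csv('model_stats.csv')

# 'de' and 'en' hold the aggregated German/English scores (arc, hellaswag, mmlu,
# per the tab description); 'model' is the raw column name and is hidden in the UI.
ranking = (stats.drop('model', axis=1)
                .sort_values('de', ascending=False)
                .reset_index(drop=True))

print(ranking.head().to_string(index=False))
```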