Spaces:
Running
Running
Florian Leuerer
committed on
Commit
•
2ce0e9d
1
Parent(s):
8a2da5a
added benchmarks
Browse files
- app.py +38 -17
- model_stats.csv +49 -0
app.py
CHANGED
@@ -3,14 +3,17 @@ import pandas as pd
|
|
3 |
import random
|
4 |
|
5 |
df = pd.read_csv('data.csv')
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
-
models = df.columns.tolist()
|
9 |
-
print(models)
|
10 |
models.remove('hash')
|
11 |
models.remove('message')
|
12 |
messages = sorted(df['message'].tolist(), key=len)
|
13 |
-
messages_select = [(m[:
|
|
|
14 |
|
15 |
def out(message, model1, model2):
|
16 |
row = df[df['message'] == message]
|
@@ -18,24 +21,42 @@ def out(message, model1, model2):
|
|
18 |
output2 = row[model2].values[0]
|
19 |
return message, output1, output2
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
with gr.Blocks() as iface:
|
23 |
-
gr.
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
btn.click(out,
|
38 |
inputs=[drop_message, drop_model1, drop_model2],
|
39 |
outputs=[out_message, out_model1, out_model2])
|
40 |
|
41 |
-
iface.
|
|
|
3 |
import random
|
4 |
|
5 |
df = pd.read_csv('data.csv')
|
6 |
+
df_stats = pd.read_csv('data_stats_langs.csv')
|
7 |
+
map_models = df_stats[['model','model_name']].set_index('model').to_dict()
|
8 |
+
df = df.rename(columns=map_models['model_name'])
|
9 |
|
10 |
|
11 |
+
models = sorted(df.columns.tolist())
|
|
|
12 |
models.remove('hash')
|
13 |
models.remove('message')
|
14 |
messages = sorted(df['message'].tolist(), key=len)
|
15 |
+
messages_select = [(m[:250],m) for m in messages]
|
16 |
+
|
17 |
|
18 |
def out(message, model1, model2):
|
19 |
row = df[df['message'] == message]
|
|
|
21 |
output2 = row[model2].values[0]
|
22 |
return message, output1, output2
|
23 |
|
24 |
+
OUTPUT_DESCRIPTION='''How good are OpenSource LLMs in German? I've benchmarked a couple of models and generated outputs for about 250 prompts to compare the models.
|
25 |
+
|
26 |
+
For information about the used dataset and generation see the [README.md](https://huggingface.co/spaces/floleuerer/german_llm_outputs/blob/main/README.md)
|
27 |
+
|
28 |
+
Select a Prompt and the models you would like to compare -> hit "Show Outputs"
|
29 |
+
'''
|
30 |
+
|
31 |
+
BENCHMARK_DESCRIPTION='''# Columns
|
32 |
+
de: German Benchmark results (arc, hellaswag, mmlu)
|
33 |
+
en: English Benchmark results (arc, hellaswag, mmlu)
|
34 |
+
de_frac: Given a german prompt - how often does the model correctly respond in German?
|
35 |
+
'''
|
36 |
|
37 |
with gr.Blocks() as iface:
|
38 |
+
with gr.Tab('Model Outputs'):
|
39 |
+
gr.Markdown(OUTPUT_DESCRIPTION)
|
40 |
+
with gr.Row():
|
41 |
+
drop_message = gr.Dropdown(messages_select, label='Prompt', value=random.choice(messages))
|
42 |
+
with gr.Row():
|
43 |
+
drop_model1 = gr.Dropdown(models, label='Model 1', value=random.choice(models))
|
44 |
+
drop_model2 = gr.Dropdown(models, label='Model 2', value=random.choice(models))
|
45 |
+
with gr.Row():
|
46 |
+
btn = gr.Button("Show Outputs")
|
47 |
+
with gr.Row():
|
48 |
+
out_message = gr.TextArea(label='Prompt')
|
49 |
+
with gr.Row():
|
50 |
+
out_model1 = gr.TextArea(label='Output Model 1')
|
51 |
+
out_model2 = gr.TextArea(label='Output Model 2')
|
52 |
+
with gr.Tab('Benchmarks'):
|
53 |
+
gr.Markdown(BENCHMARK_DESCRIPTION)
|
54 |
+
gr.Dataframe(df_stats.drop('model', axis=1))
|
55 |
+
|
56 |
+
|
57 |
|
58 |
btn.click(out,
|
59 |
inputs=[drop_message, drop_model1, drop_model2],
|
60 |
outputs=[out_message, out_model1, out_model2])
|
61 |
|
62 |
+
iface.launch()
|
model_stats.csv
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model_name,de,en,model
|
2 |
+
mattshumer/mistral-8x7b-chat (Mistral MoE),0.5600046414510724,0.6436643654583162,mattshumer_mistral-8x7b-chat
|
3 |
+
DiscoResearch/mixtral-7b-8expert (Mistral MoE),0.530225066820544,0.6235667963921786,DiscoResearch_mixtral-7b-8expert
|
4 |
+
VAGOsolutions/SauerkrautLM-SOLAR-Instruct (Mistral),0.5264626841230896,0.6448543385417635,VAGOsolutions_SauerkrautLM-SOLAR-Instruct
|
5 |
+
upstage/SOLAR-10.7B-Instruct-v1.0 (Mistral),0.5185997487442598,0.6412418664969965,upstage_SOLAR-10.7B-Instruct-v1.0
|
6 |
+
openaccess-ai-collective/DPOpenHermes-7B (Mistral),0.48089969418378625,0.6204995440653475,openaccess-ai-collective_DPOpenHermes-7B
|
7 |
+
VAGOsolutions/SauerkrautLM-7b-HerO (Mistral),0.4793394346045998,0.5973278911555183,VAGOsolutions_SauerkrautLM-7b-HerO
|
8 |
+
malteos/hermeo-7b (Mistral),0.47863155933837304,0.5786517129900094,malteos_hermeo-7b
|
9 |
+
fblgit/una-cybertron-7b-v2-bf16 (Mistral),0.4714702998712998,0.6280572487472983,fblgit_una-cybertron-7b-v2-bf16
|
10 |
+
berkeley-nest/Starling-LM-7B-alpha (Mistral),0.4682092030538821,0.6082623478431886,berkeley-nest_Starling-LM-7B-alpha
|
11 |
+
openchat/openchat_3.5 (Mistral),0.4677697134012669,0.6093495408502738,openchat_openchat_3.5
|
12 |
+
mistralai/Mistral-7B-Instruct-v0.2 (Mistral),0.4671287660155921,0.5974778867312014,mistralai_Mistral-7B-Instruct-v0.2
|
13 |
+
teknium/OpenHermes-2.5-Mistral-7B (Mistral),0.4648073216754412,0.6009262837808728,teknium_OpenHermes-2.5-Mistral-7B
|
14 |
+
HuggingFaceH4/zephyr-7b-beta (Mistral),0.46414279659716645,0.5990940441734774,HuggingFaceH4_zephyr-7b-beta
|
15 |
+
VAGOsolutions/SauerkrautLM-7b-v1-mistral (Mistral),0.45828853906316275,0.5730988906371487,VAGOsolutions_SauerkrautLM-7b-v1-mistral
|
16 |
+
ehartford/dolphin-2.2.1-mistral-7b (Mistral),0.4575957336188839,0.5889248773933949,ehartford_dolphin-2.2.1-mistral-7b
|
17 |
+
argilla/notus-7b-v1 (Mistral),0.4538267074114633,0.6071679590684926,argilla_notus-7b-v1
|
18 |
+
TheBloke/Llama-2-70B-Chat-GPTQ (Llama2),0.45323342583643317,0.5790860529254064,TheBloke_Llama-2-70B-Chat-GPTQ
|
19 |
+
jphme/em_german_leo_mistral (Mistral),0.45282233724600435,0.4929055625300098,jphme_em_german_leo_mistral
|
20 |
+
Intel/neural-chat-7b-v3-1 (Mistral),0.44742151731716423,0.6027940574092564,Intel_neural-chat-7b-v3-1
|
21 |
+
LeoLM/leo-mistral-hessianai-7b-chat (Mistral),0.4411736805781934,0.4785774283431438,LeoLM_leo-mistral-hessianai-7b-chat
|
22 |
+
mistralai/Mistral-7B-v0.1 (Mistral),0.4386540667014251,0.5698303097528304,mistralai_Mistral-7B-v0.1
|
23 |
+
lmsys/vicuna-13b-v1.5 (Llama2),0.43458993199534945,0.5394977448615276,lmsys_vicuna-13b-v1.5
|
24 |
+
microsoft/Orca-2-13b (Llama2),0.4304508877524478,0.5665260610086892,microsoft_Orca-2-13b
|
25 |
+
LeoLM/leo-hessianai-7b-chat (Llama2),0.4198959530774699,0.46992795158940703,LeoLM_leo-hessianai-7b-chat
|
26 |
+
kaist-ai/prometheus-13b-v1.0 (Llama2),0.4137699872914333,0.5276183571587882,kaist-ai_prometheus-13b-v1.0
|
27 |
+
meta-llama/Llama-2-13b-chat-hf (Llama2),0.4133328486084298,0.5331779969903168,meta-llama_Llama-2-13b-chat-hf
|
28 |
+
Deci/DeciLM-7B-instruct (Deci),0.4087346114484392,0.5746784325286699,Deci_DeciLM-7B-instruct
|
29 |
+
LeoLM/leo-hessianai-7b-chat-bilingual (Llama2),0.40584143945101053,0.4701378898025081,LeoLM_leo-hessianai-7b-chat-bilingual
|
30 |
+
mistralai/Mistral-7B-Instruct-v0.1 (Mistral),0.40127209869969643,0.5329324676998503,mistralai_Mistral-7B-Instruct-v0.1
|
31 |
+
allenai/tulu-2-dpo-7b (Llama2),0.3963844596307675,0.5442215865769193,allenai_tulu-2-dpo-7b
|
32 |
+
Deci/DeciLM-7B (Deci),0.3891524333700293,0.5632148707253728,Deci_DeciLM-7B
|
33 |
+
microsoft/Orca-2-7b (Llama2),0.387631543457434,0.5275512116539839,microsoft_Orca-2-7b
|
34 |
+
Qwen/Qwen-7B (Qwen),0.3854029046626822,0.5250934481726963,Qwen_Qwen-7B
|
35 |
+
deepseek-ai/deepseek-llm-7b-chat (Deepseek),0.3805290423206953,0.5228634605739054,deepseek-ai_deepseek-llm-7b-chat
|
36 |
+
Qwen/Qwen-7B-Chat (Qwen),0.37947320299489906,0.501690371644847,Qwen_Qwen-7B-Chat
|
37 |
+
lmsys/vicuna-7b-v1.5 (Llama2),0.3743686900229406,0.4930702792560562,lmsys_vicuna-7b-v1.5
|
38 |
+
LeoLM/leo-hessianai-7b (Llama2),0.37147688304608356,0.4350647594408416,LeoLM_leo-hessianai-7b
|
39 |
+
meta-llama/Llama-2-7b-chat-hf (Llama2),0.35079143396246293,0.4932635613314786,meta-llama_Llama-2-7b-chat-hf
|
40 |
+
deepseek-ai/deepseek-llm-7b-base (Deepseek),0.3427809248139573,0.4823404844220252,deepseek-ai_deepseek-llm-7b-base
|
41 |
+
meta-llama/Llama-2-7b-hf (Llama2),0.34263087849908386,0.4704191232019917,meta-llama_Llama-2-7b-hf
|
42 |
+
01-ai/Yi-34B-Chat-8bits (Yi),0.3310580204778157,0.5418088737201365,01-ai_Yi-34B-Chat-8bits
|
43 |
+
mosaicml/mpt-7b-chat (Mosaic),0.3027332136150936,0.45647416228902604,mosaicml_mpt-7b-chat
|
44 |
+
mosaicml/mpt-7b-instruct (Mosaic),0.30104594868674767,0.4486541776873733,mosaicml_mpt-7b-instruct
|
45 |
+
mosaicml/mpt-7b (Mosaic),0.2907097637642126,0.42439863757918533,mosaicml_mpt-7b
|
46 |
+
microsoft/phi-2 (Phi),0.26701930278508595,0.5443139382853103,microsoft_phi-2
|
47 |
+
tiiuae/falcon-7b-instruct (Falcon),0.2605640889237062,0.38614111076652985,tiiuae_falcon-7b-instruct
|
48 |
+
microsoft/phi-1_5 (Phi),0.24409127820085472,0.44763163375883214,microsoft_phi-1_5
|
49 |
+
microsoft/phi-1 (Phi),0.2371333593192149,0.2460143542470721,microsoft_phi-1
|