djstrong committed on
Commit
397694c
β€’
1 Parent(s): aa5756a
Files changed (5) hide show
  1. README.md +3 -3
  2. app2.py +81 -0
  3. benchmark_results.csv +149 -0
  4. requirements.txt +1 -15
  5. src/about.py +2 -2
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Test123
3
- emoji: πŸ₯‡
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
  sdk_version: 4.37.1
 
1
  ---
2
+ title: Polish EQ-Bench Leaderboard
3
+ emoji: πŸ†πŸ©·πŸ‡΅πŸ‡±
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ app_file: app2.py
8
  pinned: true
9
  license: apache-2.0
10
  sdk_version: 4.37.1
app2.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)

# The CSV has 14 columns; the trailing "Error" field may itself contain
# unquoted commas, so each data row is split at most 13 times to keep the
# whole error message in one field.
_MAX_SPLITS = 13


def _load_results(path):
    """Read the benchmark CSV at *path* into a string-typed DataFrame.

    Parsed by hand rather than with ``pd.read_csv`` because the last
    column ("Error") can contain unquoted commas.
    """
    rows = []
    with open(path, "r") as f:
        header = [h.strip() for h in f.readline().strip().split(",")]
        for line in f:
            rows.append(line.strip().split(",", _MAX_SPLITS))
    return pd.DataFrame(rows, columns=header)


def _parse_parseable(row):
    """Recover the parseable-question count for FAILED runs.

    Runs rejected for too few parseable answers still report the real
    count inside the "Error" column (e.g. "54.0 questions were parseable
    (min is 83%)"); extract it so the leaderboard can display the number.
    Returns NaN when the error message carries no count.
    """
    if row["Num Questions Parseable"] == "FAILED":
        match = re.match(r"(\d+)\.0 questions were parseable", row["Error"])
        # Guard: not every failure message contains a parseable count.
        return match.group(1) if match else numpy.nan
    return row["Num Questions Parseable"]


def _build_leaderboard(path="benchmark_results.csv"):
    """Return the styled leaderboard DataFrame for the Polish EQ-Bench."""
    df = _load_results(path)

    # Keep only the Polish benchmark runs (v1 and v2).
    df = df[df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"])]
    df = df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]

    df["Num Questions Parseable"] = df[
        ["Num Questions Parseable", "Error"]
    ].apply(_parse_parseable, axis=1)

    # Failed runs have no score. A few broken runs report absurd negative
    # scores (e.g. -1.3e+23); clamp those to 0 so the gradient stays sane.
    df["Benchmark Score"] = df["Benchmark Score"].replace("FAILED", numpy.nan)
    df["Benchmark Score"] = df["Benchmark Score"].astype(float)
    df["Num Questions Parseable"] = df["Num Questions Parseable"].astype(float)
    df.loc[df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # Best score first; ties broken by how many questions were parseable.
    df = df.sort_values(
        by=["Benchmark Score", "Num Questions Parseable"],
        ascending=[False, False],
    )

    styled = df.style.background_gradient(cmap="RdYlGn")
    styled = styled.format(
        {"Benchmark Score": "{:.2f}", "Num Questions Parseable": "{:.0f}"}
    )
    return styled


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    leaderboard_table = gr.components.Dataframe(
        value=_build_leaderboard(),
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

demo.queue(default_concurrency_limit=40).launch()
benchmark_results.csv ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, Benchmark Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error
2
+ Bielik_v0.1,2024-06-18 12:48:51,,speakleash/Bielik-7B-Instruct-v0.1,,,47.1,eq-bench_v2,170.0,1,transformers, ,,
3
+ Bielik_v0.1,2024-06-18 13:44:54,,speakleash/Bielik-7B-Instruct-v0.1,,,34.17,eq-bench_v2_pl,149.0,1,transformers, ,,
4
+ Bielik_v0.1,2024-06-18 14:01:46,,speakleash/Bielik-7B-Instruct-v0.1,,,34.27,eq-bench_v2_pl,156.0,1,transformers, ,,
5
+ openchat-gemma,2024-06-18 14:03:04,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
6
+ Bielik_v0.2,2024-06-18 14:10:38,,../models/gwint2,,,69.93,eq-bench_v2_pl,171.0,1,transformers, ,,
7
+ Bielik_v0.2,2024-06-18 14:23:48,,../models/gwint2,,,72.37,eq-bench_v2,171.0,1,transformers, ,,
8
+ openchat-35-0106,2024-06-18 14:30:24,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
9
+ openchat-35-0106,2024-06-18 15:15:03,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
10
+ glm-4-9b-chat,2024-06-18 15:16:14,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
11
+ openchat-35-0106,2024-06-18 15:19:01,,openchat/openchat-3.5-0106,,,72.92,eq-bench_v2,171.0,1,transformers, ,,
12
+ glm-4-9b-chat,2024-06-18 15:20:10,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
13
+ openchat-35-0106,2024-06-18 15:22:41,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
14
+ glm-4-9b-chat,2024-06-18 15:23:50,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
15
+ glm-4-9b-chat,2024-06-18 15:26:30,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
16
+ glm-4-9b-chat,2024-06-18 16:30:21,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
17
+ glm-4-9b-chat-1m,2024-06-18 16:54:28,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
18
+ glm-4-9b-chat-1m,2024-06-18 17:05:16,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
19
+ openchat-3.6-8b-20240522,2024-06-18 17:12:00,,openchat/openchat-3.6-8b-20240522,,,-1.339640900815702e+23,eq-bench_v2,171.0,1,transformers, ,,
20
+ openchat-gemma,2024-06-18 17:13:12,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
21
+ Meta-Llama-3-8B-Instruct,2024-06-18 21:29:03,,meta-llama/Meta-Llama-3-8B-Instruct,,,69.09,eq-bench_v2,171.0,1,transformers, ,,
22
+ Starling-LM-7B-alpha,2024-06-18 21:45:18,,berkeley-nest/Starling-LM-7B-alpha,,,49.63,eq-bench_v2_pl,171.0,1,transformers, ,,
23
+ Starling-LM-7B-beta,2024-06-18 21:51:54,,Nexusflow/Starling-LM-7B-beta,,,44.91,eq-bench_v2_pl,159.0,1,transformers, ,,
24
+ Mistral-7B-Instruct-v0.2,2024-06-18 21:52:17,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
25
+ Mistral-7B-Instruct-v0.1,2024-06-18 22:26:07,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
26
+ Meta-Llama-3-8B-Instruct,2024-06-18 22:35:53,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.53,eq-bench_v2_pl,171.0,1,transformers, ,,
27
+ openchat-gemma,2024-06-19 09:30:28,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
28
+ Mistral-7B-Instruct-v0.2,2024-06-19 09:30:46,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
29
+ openchat-gemma,2024-06-19 09:35:50,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
30
+ Mistral-7B-Instruct-v0.2,2024-06-19 09:36:01,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
31
+ openchat-gemma,2024-06-19 09:43:53,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
32
+ Mistral-7B-Instruct-v0.2,2024-06-19 09:49:42,,mistralai/Mistral-7B-Instruct-v0.2,,,52.99,eq-bench_v2_pl,148.0,1,transformers, ,,
33
+ openchat-gemma,2024-06-19 09:54:01,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
34
+ openchat-gemma,2024-06-19 10:16:52,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
35
+ openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
36
+ Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
37
+ SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
38
+ Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
39
+ models/gwint1/hf,2024-06-19 10:55:32,,models/gwint1/hf,,,FAILED,eq-bench,FAILED,1,transformers, ,,Incorrect path_or_model_id: 'models/gwint1/hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
40
+ models/gwint2,2024-06-19 10:55:32,,models/gwint2,,,FAILED,eq-bench,FAILED,1,transformers, ,,models/gwint2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
41
+ ../models/gwint1/hf,2024-06-19 10:56:07,,models/gwint1/hf,,,FAILED,eq-bench,FAILED,1,transformers, ,,Incorrect path_or_model_id: 'models/gwint1/hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
42
+ ../models/gwint2,2024-06-19 10:56:07,,models/gwint2,,,FAILED,eq-bench,FAILED,1,transformers, ,,models/gwint2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
43
+ ../models/gwint1/hf,2024-06-19 11:04:28,,models/gwint1/hf,,,FAILED,eq-bench,FAILED,1,transformers, ,,Incorrect path_or_model_id: 'models/gwint1/hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
44
+ ../models/gwint2,2024-06-19 11:04:29,,models/gwint2,,,FAILED,eq-bench,FAILED,1,transformers, ,,models/gwint2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
45
+ models/gwint1/hf,2024-06-19 11:15:13,,../models/gwint1/hf,,,37.88,eq-bench_v2_pl,169.0,1,transformers, ,,
46
+ models/gwint2,2024-06-19 11:21:15,,../models/gwint2,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
47
+ Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
48
+ Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
49
+ Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
50
+ allegro/plt5-large,2024-06-19 11:51:22,,allegro/plt5-large,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.
51
+ APT3-1B-Instruct-e1,2024-06-19 11:51:22,,APT3-1B-Instruct-e1,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
52
+ APT3-1B-Instruct-e2,2024-06-19 11:51:23,,APT3-1B-Instruct-e2,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
53
+ Azurro/APT3-1B-Base,2024-06-19 12:00:40,,Azurro/APT3-1B-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
54
+ OPI-PG/Qra-1b,2024-06-19 12:13:15,,OPI-PG/Qra-1b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
55
+ TinyLlama/TinyLlama-1.1B-Chat-v1.0,2024-06-19 12:23:45,,TinyLlama/TinyLlama-1.1B-Chat-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,36.0 questions were parseable (min is 83%)
56
+ Qwen/Qwen2-1.5B,2024-06-19 12:35:37,,Qwen/Qwen2-1.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,54.0 questions were parseable (min is 83%)
57
+ Qwen/Qwen2-1.5B-Instruct,2024-06-19 12:38:29,,Qwen/Qwen2-1.5B-Instruct,,,15.33,eq-bench_v2_pl,165.0,1,transformers, ,,
58
+ sdadas/polish-gpt2-xl,2024-06-19 12:54:39,,sdadas/polish-gpt2-xl,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
59
+ internlm/internlm2-1_8b,2024-06-19 13:08:50,,internlm/internlm2-1_8b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
60
+ internlm/internlm2-chat-1_8b,2024-06-19 13:13:21,,internlm/internlm2-chat-1_8b,,,13.83,eq-bench_v2_pl,150.0,1,transformers, ,,
61
+ google/gemma-1.1-2b-it,2024-06-19 13:15:24,,google/gemma-1.1-2b-it,,,16.47,eq-bench_v2_pl,171.0,1,transformers, ,,
62
+ microsoft/phi-2,2024-06-19 13:28:07,,microsoft/phi-2,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
63
+ google/mt5-xl,2024-06-19 13:28:10,,google/mt5-xl,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.mt5.configuration_mt5.MT5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, InternLM2Config, InternLM2Config.
64
+ microsoft/Phi-3-mini-4k-instruct,2024-06-19 13:34:56,,microsoft/Phi-3-mini-4k-instruct,,,28.05,eq-bench_v2_pl,159.0,1,transformers, ,,
65
+ ssmits/Falcon2-5.5B-Polish,2024-06-19 13:47:21,,ssmits/Falcon2-5.5B-Polish,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
66
+ 01-ai/Yi-1.5-6B,2024-06-19 14:04:20,,01-ai/Yi-1.5-6B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
67
+ 01-ai/Yi-1.5-6B-Chat,2024-06-19 14:11:22,,01-ai/Yi-1.5-6B-Chat,,,5.19,eq-bench_v2_pl,161.0,1,transformers, ,,
68
+ THUDM/chatglm3-6b,2024-06-19 14:12:11,,THUDM/chatglm3-6b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
69
+ THUDM/chatglm3-6b-base,2024-06-19 14:13:00,,THUDM/chatglm3-6b-base,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
70
+ alpindale/Mistral-7B-v0.2-hf,2024-06-19 14:16:37,,alpindale/Mistral-7B-v0.2-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,45.0 questions were parseable (min is 83%)
71
+ berkeley-nest/Starling-LM-7B-alpha,2024-06-19 14:22:32,,berkeley-nest/Starling-LM-7B-alpha,,,46.26,eq-bench_v2_pl,171.0,1,transformers, ,,
72
+ google/gemma-7b,2024-06-19 14:38:02,,google/gemma-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
73
+ google/gemma-7b-it,2024-06-19 14:53:28,,google/gemma-7b-it,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
74
+ HuggingFaceH4/zephyr-7b-alpha,2024-06-19 15:05:31,,HuggingFaceH4/zephyr-7b-alpha,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,99.0 questions were parseable (min is 83%)
75
+ HuggingFaceH4/zephyr-7b-beta,2024-06-19 15:18:24,,HuggingFaceH4/zephyr-7b-beta,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,88.0 questions were parseable (min is 83%)
76
+ internlm/internlm2-7b,2024-06-19 15:36:06,,internlm/internlm2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,43.0 questions were parseable (min is 83%)
77
+ internlm/internlm2-base-7b,2024-06-19 15:54:53,,internlm/internlm2-base-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,6.0 questions were parseable (min is 83%)
78
+ internlm/internlm2-chat-7b,2024-06-19 16:02:07,,internlm/internlm2-chat-7b,,,40.0,eq-bench_v2_pl,169.0,1,transformers, ,,
79
+ internlm/internlm2-chat-7b-sft,2024-06-19 16:07:04,,internlm/internlm2-chat-7b-sft,,,41.62,eq-bench_v2_pl,170.0,1,transformers, ,,
80
+ lex-hue/Delexa-7b,2024-06-19 16:12:19,,lex-hue/Delexa-7b,,,49.03,eq-bench_v2_pl,169.0,1,transformers, ,,
81
+ meta-llama/Llama-2-7b-chat-hf,2024-06-19 16:21:08,,meta-llama/Llama-2-7b-chat-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,116.0 questions were parseable (min is 83%)
82
+ meta-llama/Llama-2-7b-hf,2024-06-19 16:36:41,,meta-llama/Llama-2-7b-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
83
+ microsoft/WizardLM-2-7B,2024-06-19 16:44:22,,microsoft/WizardLM-2-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,137.0 questions were parseable (min is 83%)
84
+ mistralai/Mistral-7B-Instruct-v0.1,2024-06-19 16:44:33,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
85
+ mistralai/Mistral-7B-Instruct-v0.2,2024-06-19 16:50:36,,mistralai/Mistral-7B-Instruct-v0.2,,,53.25,eq-bench_v2_pl,151.0,1,transformers, ,,
86
+ mistralai/Mistral-7B-Instruct-v0.3,2024-06-19 16:54:49,,mistralai/Mistral-7B-Instruct-v0.3,,,45.21,eq-bench_v2_pl,171.0,1,transformers, ,,
87
+ mistralai/Mistral-7B-v0.1,2024-06-19 16:59:50,,mistralai/Mistral-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,65.0 questions were parseable (min is 83%)
88
+ mistralai/Mistral-7B-v0.3,2024-06-19 17:16:38,,mistralai/Mistral-7B-v0.3,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,14.0 questions were parseable (min is 83%)
89
+ Nexusflow/Starling-LM-7B-beta,2024-06-19 17:23:18,,Nexusflow/Starling-LM-7B-beta,,,45.1,eq-bench_v2_pl,166.0,1,transformers, ,,
90
+ openchat/openchat-3.5-0106,2024-06-19 17:27:10,,openchat/openchat-3.5-0106,,,43.81,eq-bench_v2_pl,171.0,1,transformers, ,,
91
+ openchat/openchat-3.5-0106-gemma,2024-06-19 17:30:31,,openchat/openchat-3.5-0106-gemma,,,58.62,eq-bench_v2_pl,169.0,1,transformers, ,,
92
+ openchat/openchat-3.5-1210,2024-06-19 17:34:27,,openchat/openchat-3.5-1210,,,49.04,eq-bench_v2_pl,171.0,1,transformers, ,,
93
+ OPI-PG/Qra-7b,2024-06-19 17:50:28,,OPI-PG/Qra-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
94
+ Qwen/Qwen1.5-7B,2024-06-19 17:57:53,,Qwen/Qwen1.5-7B,,,23.11,eq-bench_v2_pl,155.0,1,transformers, ,,
95
+ Qwen/Qwen1.5-7B-Chat,2024-06-19 18:03:34,,Qwen/Qwen1.5-7B-Chat,,,25.0,eq-bench_v2_pl,164.0,1,transformers, ,,
96
+ Qwen/Qwen2-7B,2024-06-19 18:09:23,,Qwen/Qwen2-7B,,,36.58,eq-bench_v2_pl,166.0,1,transformers, ,,
97
+ Qwen/Qwen2-7B-Instruct,2024-06-19 18:12:42,,Qwen/Qwen2-7B-Instruct,,,53.74,eq-bench_v2_pl,171.0,1,transformers, ,,
98
+ Remek/Kruk-7B-SP-001,2024-06-19 18:17:13,,Remek/Kruk-7B-SP-001,,,44.44,eq-bench_v2_pl,171.0,1,transformers, ,,
99
+ Remek/OpenChat-3.5-0106-PL-Omnibusv2,2024-06-19 18:17:24,,Remek/OpenChat-3.5-0106-PL-Omnibusv2,,,FAILED,eq-bench,FAILED,1,transformers, ,,'system_message' is undefined
100
+ Remek/OpenChat3.5-0106-Spichlerz-Bocian,2024-06-19 18:24:08,,Remek/OpenChat3.5-0106-Spichlerz-Bocian,,,44.13,eq-bench_v2_pl,166.0,1,transformers, ,,
101
+ Remek/OpenChat3.5-0106-Spichlerz-Inst-001,2024-06-19 18:28:48,,Remek/OpenChat3.5-0106-Spichlerz-Inst-001,,,41.6,eq-bench_v2_pl,171.0,1,transformers, ,,
102
+ RWKV/HF_v5-Eagle-7B,2024-06-19 19:16:27,,RWKV/HF_v5-Eagle-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
103
+ RWKV/v5-Eagle-7B-HF,2024-06-19 20:04:12,,RWKV/v5-Eagle-7B-HF,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
104
+ speakleash/Bielik-7B-v0.1,2024-06-19 20:11:16,,speakleash/Bielik-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,139.0 questions were parseable (min is 83%)
105
+ szymonrucinski/Curie-7B-v1,2024-06-19 20:29:24,,szymonrucinski/Curie-7B-v1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
106
+ teknium/OpenHermes-2.5-Mistral-7B,2024-06-19 20:34:12,,teknium/OpenHermes-2.5-Mistral-7B,,,37.48,eq-bench_v2_pl,171.0,1,transformers, ,,
107
+ Voicelab/trurl-2-7b,2024-06-19 20:39:26,,Voicelab/trurl-2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,141.0 questions were parseable (min is 83%)
108
+ microsoft/Phi-3-small-8k-instruct,2024-06-19 20:39:31,,microsoft/Phi-3-small-8k-instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,No module named 'pytest'
109
+ CohereForAI/aya-23-8B,2024-06-19 20:44:01,,CohereForAI/aya-23-8B,,,45.43,eq-bench_v2_pl,171.0,1,transformers, ,,
110
+ meta-llama/Meta-Llama-3-8B,2024-06-19 21:01:55,,meta-llama/Meta-Llama-3-8B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
111
+ meta-llama/Meta-Llama-3-8B-Instruct,2024-06-19 21:06:08,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.27,eq-bench_v2_pl,171.0,1,transformers, ,,
112
+ mlabonne/NeuralDaredevil-8B-abliterated,2024-06-19 21:13:31,,mlabonne/NeuralDaredevil-8B-abliterated,,,54.74,eq-bench_v2_pl,171.0,1,transformers, ,,
113
+ NousResearch/Hermes-2-Pro-Llama-3-8B,2024-06-19 21:18:18,,NousResearch/Hermes-2-Pro-Llama-3-8B,,,54.57,eq-bench_v2_pl,171.0,1,transformers, ,,
114
+ NousResearch/Hermes-2-Theta-Llama-3-8B,2024-06-19 21:25:22,,NousResearch/Hermes-2-Theta-Llama-3-8B,,,54.88,eq-bench_v2_pl,171.0,1,transformers, ,,
115
+ nvidia/Llama3-ChatQA-1.5-8B,2024-06-19 22:27:24,,nvidia/Llama3-ChatQA-1.5-8B,,,40.55,eq-bench_v2_pl,166.0,1,transformers, ,,
116
+ openchat/openchat-3.6-8b-20240522,2024-06-19 22:34:56,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796595e+18,eq-bench_v2_pl,170.0,1,transformers, ,,
117
+ Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,2024-06-19 22:39:46,,Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,,,26.63,eq-bench_v2_pl,171.0,1,transformers, ,,
118
+ 01-ai/Yi-1.5-9B,2024-06-19 23:07:56,,01-ai/Yi-1.5-9B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
119
+ 01-ai/Yi-1.5-9B-Chat,2024-06-19 23:19:16,,01-ai/Yi-1.5-9B-Chat,,,48.78,eq-bench_v2_pl,163.0,1,transformers, ,,
120
+ google/recurrentgemma-9b-it,2024-06-19 23:28:19,,google/recurrentgemma-9b-it,,,52.82,eq-bench_v2_pl,171.0,1,transformers, ,,
121
+ THUDM/glm-4-9b,2024-06-19 23:28:41,,THUDM/glm-4-9b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
122
+ THUDM/glm-4-9b-chat,2024-06-19 23:29:01,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
123
+ NousResearch/Nous-Hermes-2-SOLAR-10.7B,2024-06-19 23:51:07,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,49.85,eq-bench_v2_pl,169.0,1,transformers, ,,
124
+ TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,2024-06-20 00:00:02,,TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,,,35.63,eq-bench_v2_pl,164.0,1,transformers, ,,
125
+ upstage/SOLAR-10.7B-Instruct-v1.0,2024-06-20 00:19:48,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.35,eq-bench_v2_pl,162.0,1,transformers, ,,
126
+ upstage/SOLAR-10.7B-v1.0,2024-06-20 01:12:51,,upstage/SOLAR-10.7B-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
127
+ tiiuae/falcon-11B,2024-06-20 01:23:54,,tiiuae/falcon-11B,,,42.41,eq-bench_v2_pl,171.0,1,transformers, ,,
128
+ lmsys/vicuna-13b-v1.5,2024-06-20 01:43:40,,lmsys/vicuna-13b-v1.5,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,84.0 questions were parseable (min is 83%)
129
+ OPI-PG/Qra-13b,2024-06-20 02:07:48,,OPI-PG/Qra-13b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
130
+ teknium/OpenHermes-13B,2024-06-20 02:32:04,,teknium/OpenHermes-13B,,,36.85,eq-bench_v2_pl,162.0,1,transformers, ,,
131
+ Voicelab/trurl-2-13b-academic,2024-06-20 02:38:04,,Voicelab/trurl-2-13b-academic,,,25.92,eq-bench_v2_pl,162.0,1,transformers, ,,
132
+ microsoft/Phi-3-medium-4k-instruct,2024-06-20 02:46:38,,microsoft/Phi-3-medium-4k-instruct,,,57.07,eq-bench_v2_pl,169.0,1,transformers, ,,
133
+ Qwen/Qwen1.5-14B-Chat,2024-06-20 02:52:13,,Qwen/Qwen1.5-14B-Chat,,,51.26,eq-bench_v2_pl,160.0,1,transformers, ,,
134
+ internlm/internlm2-20b,2024-06-20 09:04:33,,internlm/internlm2-20b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,4.0 questions were parseable (min is 83%)
135
+ internlm/internlm2-chat-20b,2024-06-20 09:47:11,,internlm/internlm2-chat-20b,,,36.52,eq-bench_v2_pl,170.0,1,transformers, ,,
136
+ Qwen/Qwen1.5-32B,2024-06-20 13:25:12,,Qwen/Qwen1.5-32B,,,54.35,eq-bench_v2_pl,170.0,1,transformers, ,,
137
+ Qwen/Qwen1.5-32B-Chat,2024-06-20 13:34:52,,Qwen/Qwen1.5-32B-Chat,,,60.69,eq-bench_v2_pl,168.0,1,transformers, ,,
138
+ 01-ai/Yi-1.5-34B-Chat,2024-06-20 13:51:30,,01-ai/Yi-1.5-34B-Chat,,,46.32,eq-bench_v2_pl,171.0,1,transformers, ,,
139
+ CohereForAI/aya-23-35B,2024-06-20 14:03:07,,CohereForAI/aya-23-35B,,,58.41,eq-bench_v2_pl,171.0,1,transformers, ,,
140
+ CohereForAI/c4ai-command-r-v01,2024-06-20 14:14:54,,CohereForAI/c4ai-command-r-v01,,,56.43,eq-bench_v2_pl,171.0,1,transformers, ,,
141
+ mistralai/Mixtral-8x7B-Instruct-v0.1,2024-06-20 14:35:28,,mistralai/Mixtral-8x7B-Instruct-v0.1,,,58.64,eq-bench_v2_pl,168.0,1,transformers, ,,
142
+ mistralai/Mixtral-8x7B-v0.1,2024-06-20 15:30:24,,mistralai/Mixtral-8x7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,10.0 questions were parseable (min is 83%)
143
+ Qwen/Qwen2-57B-A14B-Instruct,2024-06-20 16:19:41,,Qwen/Qwen2-57B-A14B-Instruct,,,57.64,eq-bench_v2_pl,171.0,1,transformers, ,,
144
+ meta-llama/Meta-Llama-3-70B,2024-06-20 16:59:30,,meta-llama/Meta-Llama-3-70B,,,46.1,eq-bench_v2_pl,145.0,1,transformers, ,,
145
+ meta-llama/Meta-Llama-3-70B-Instruct,2024-06-20 17:15:58,,meta-llama/Meta-Llama-3-70B-Instruct,,,71.21,eq-bench_v2_pl,171.0,1,transformers, ,,
146
+ Qwen/Qwen1.5-72B,2024-06-20 17:50:17,,Qwen/Qwen1.5-72B,,,53.96,eq-bench_v2_pl,163.0,1,transformers, ,,
147
+ Qwen/Qwen1.5-72B-Chat,2024-06-20 18:06:58,,Qwen/Qwen1.5-72B-Chat,,,68.03,eq-bench_v2_pl,171.0,1,transformers, ,,
148
+ Qwen/Qwen2-72B,2024-06-20 18:36:22,,Qwen/Qwen2-72B,,,69.75,eq-bench_v2_pl,169.0,1,transformers, ,,
149
+ Qwen/Qwen2-72B-Instruct,2024-06-20 18:55:02,,Qwen/Qwen2-72B-Instruct,,,72.07,eq-bench_v2_pl,169.0,1,transformers, ,,
requirements.txt CHANGED
@@ -1,18 +1,4 @@
1
- APScheduler
2
- black
3
- click
4
- datasets
5
  gradio
6
  gradio_client
7
- huggingface-hub>=0.18.0
8
- matplotlib
9
- numpy
10
  pandas
11
- python-dateutil
12
- requests
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
- accelerate
18
- sentencepiece
 
1
+ tqdm
 
 
 
2
  gradio
3
  gradio_client
 
 
 
4
  pandas
 
 
 
 
 
 
 
 
src/about.py CHANGED
@@ -21,11 +21,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">Polish EQ-Bench Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ Polish Emotional Intelligence Benchmark for LLMs
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?