djstrong committed on
Commit
397694c
β€’
1 Parent(s): aa5756a
Files changed (5) hide show
  1. README.md +3 -3
  2. app2.py +81 -0
  3. benchmark_results.csv +149 -0
  4. requirements.txt +1 -15
  5. src/about.py +2 -2
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Test123
3
- emoji: πŸ₯‡
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
  sdk_version: 4.37.1
 
1
  ---
2
+ title: Polish EQ-Bench Leaderboard
3
+ emoji: πŸ†πŸ©·πŸ‡΅πŸ‡±
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ app_file: app2.py
8
  pinned: true
9
  license: apache-2.0
10
  sdk_version: 4.37.1
app2.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)

# The CSV has 14 columns; the trailing "Error" field may itself contain
# unquoted commas, so each data row is split at most 13 times to keep the
# whole error message in one field.
_MAX_SPLITS = 13


def _load_results(path):
    """Read the benchmark CSV at *path* into a string-typed DataFrame.

    Parsed by hand rather than with ``pd.read_csv`` because the last
    column ("Error") can contain unquoted commas.
    """
    rows = []
    with open(path, "r") as f:
        header = [h.strip() for h in f.readline().strip().split(",")]
        for line in f:
            rows.append(line.strip().split(",", _MAX_SPLITS))
    return pd.DataFrame(rows, columns=header)


def _parse_parseable(row):
    """Recover the parseable-question count for FAILED runs.

    Runs rejected for too few parseable answers still report the real
    count inside the "Error" column (e.g. "54.0 questions were parseable
    (min is 83%)"); extract it so the leaderboard can display the number.
    Returns NaN when the error message carries no count.
    """
    if row["Num Questions Parseable"] == "FAILED":
        match = re.match(r"(\d+)\.0 questions were parseable", row["Error"])
        # Guard: not every failure message contains a parseable count.
        return match.group(1) if match else numpy.nan
    return row["Num Questions Parseable"]


def _build_leaderboard(path="benchmark_results.csv"):
    """Return the styled leaderboard DataFrame for the Polish EQ-Bench."""
    df = _load_results(path)

    # Keep only the Polish benchmark runs (v1 and v2).
    df = df[df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"])]
    df = df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]

    df["Num Questions Parseable"] = df[
        ["Num Questions Parseable", "Error"]
    ].apply(_parse_parseable, axis=1)

    # Failed runs have no score. A few broken runs report absurd negative
    # scores (e.g. -1.3e+23); clamp those to 0 so the gradient stays sane.
    df["Benchmark Score"] = df["Benchmark Score"].replace("FAILED", numpy.nan)
    df["Benchmark Score"] = df["Benchmark Score"].astype(float)
    df["Num Questions Parseable"] = df["Num Questions Parseable"].astype(float)
    df.loc[df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # Best score first; ties broken by how many questions were parseable.
    df = df.sort_values(
        by=["Benchmark Score", "Num Questions Parseable"],
        ascending=[False, False],
    )

    styled = df.style.background_gradient(cmap="RdYlGn")
    styled = styled.format(
        {"Benchmark Score": "{:.2f}", "Num Questions Parseable": "{:.0f}"}
    )
    return styled


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    leaderboard_table = gr.components.Dataframe(
        value=_build_leaderboard(),
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

demo.queue(default_concurrency_limit=40).launch()
benchmark_results.csv ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, Benchmark Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error
2
+ Bielik_v0.1,2024-06-18 12:48:51,,speakleash/Bielik-7B-Instruct-v0.1,,,47.1,eq-bench_v2,170.0,1,transformers, ,,
3
+ Bielik_v0.1,2024-06-18 13:44:54,,speakleash/Bielik-7B-Instruct-v0.1,,,34.17,eq-bench_v2_pl,149.0,1,transformers, ,,
4
+ Bielik_v0.1,2024-06-18 14:01:46,,speakleash/Bielik-7B-Instruct-v0.1,,,34.27,eq-bench_v2_pl,156.0,1,transformers, ,,
5
+ openchat-gemma,2024-06-18 14:03:04,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
6
+ Bielik_v0.2,2024-06-18 14:10:38,,../models/gwint2,,,69.93,eq-bench_v2_pl,171.0,1,transformers, ,,
7
+ Bielik_v0.2,2024-06-18 14:23:48,,../models/gwint2,,,72.37,eq-bench_v2,171.0,1,transformers, ,,
8
+ openchat-35-0106,2024-06-18 14:30:24,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
9
+ openchat-35-0106,2024-06-18 15:15:03,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
10
+ glm-4-9b-chat,2024-06-18 15:16:14,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
11
+ openchat-35-0106,2024-06-18 15:19:01,,openchat/openchat-3.5-0106,,,72.92,eq-bench_v2,171.0,1,transformers, ,,
12
+ glm-4-9b-chat,2024-06-18 15:20:10,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
13
+ openchat-35-0106,2024-06-18 15:22:41,,openchat/openchat-3.5-0106,,,45.69,eq-bench_v2_pl,170.0,1,transformers, ,,
14
+ glm-4-9b-chat,2024-06-18 15:23:50,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
15
+ glm-4-9b-chat,2024-06-18 15:26:30,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
16
+ glm-4-9b-chat,2024-06-18 16:30:21,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,
17
+ glm-4-9b-chat-1m,2024-06-18 16:54:28,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
18
+ glm-4-9b-chat-1m,2024-06-18 17:05:16,,THUDM/glm-4-9b-chat-1m,,,FAILED,eq-bench,FAILED,1,transformers, ,,
19
+ openchat-3.6-8b-20240522,2024-06-18 17:12:00,,openchat/openchat-3.6-8b-20240522,,,-1.339640900815702e+23,eq-bench_v2,171.0,1,transformers, ,,
20
+ openchat-gemma,2024-06-18 17:13:12,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
21
+ Meta-Llama-3-8B-Instruct,2024-06-18 21:29:03,,meta-llama/Meta-Llama-3-8B-Instruct,,,69.09,eq-bench_v2,171.0,1,transformers, ,,
22
+ Starling-LM-7B-alpha,2024-06-18 21:45:18,,berkeley-nest/Starling-LM-7B-alpha,,,49.63,eq-bench_v2_pl,171.0,1,transformers, ,,
23
+ Starling-LM-7B-beta,2024-06-18 21:51:54,,Nexusflow/Starling-LM-7B-beta,,,44.91,eq-bench_v2_pl,159.0,1,transformers, ,,
24
+ Mistral-7B-Instruct-v0.2,2024-06-18 21:52:17,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
25
+ Mistral-7B-Instruct-v0.1,2024-06-18 22:26:07,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
26
+ Meta-Llama-3-8B-Instruct,2024-06-18 22:35:53,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.53,eq-bench_v2_pl,171.0,1,transformers, ,,
27
+ openchat-gemma,2024-06-19 09:30:28,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
28
+ Mistral-7B-Instruct-v0.2,2024-06-19 09:30:46,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
29
+ openchat-gemma,2024-06-19 09:35:50,,openchat/openchat-3.5-0106-gemma,,,FAILED,eq-bench,FAILED,1,transformers, ,,System role not supported
30
+ Mistral-7B-Instruct-v0.2,2024-06-19 09:36:01,,mistralai/Mistral-7B-Instruct-v0.2,,,FAILED,eq-bench,FAILED,1,transformers, ,,Conversation roles must alternate user/assistant/user/assistant/...
31
+ openchat-gemma,2024-06-19 09:43:53,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
32
+ Mistral-7B-Instruct-v0.2,2024-06-19 09:49:42,,mistralai/Mistral-7B-Instruct-v0.2,,,52.99,eq-bench_v2_pl,148.0,1,transformers, ,,
33
+ openchat-gemma,2024-06-19 09:54:01,,openchat/openchat-3.5-0106-gemma,,,60.11,eq-bench_v2_pl,169.0,1,transformers, ,,
34
+ openchat-gemma,2024-06-19 10:16:52,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
35
+ openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-bench_v2_pl,170.0,1,transformers, ,,
36
+ Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
37
+ SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
38
+ Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
39
+ models/gwint1/hf,2024-06-19 10:55:32,,models/gwint1/hf,,,FAILED,eq-bench,FAILED,1,transformers, ,,Incorrect path_or_model_id: 'models/gwint1/hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
40
+ models/gwint2,2024-06-19 10:55:32,,models/gwint2,,,FAILED,eq-bench,FAILED,1,transformers, ,,models/gwint2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
41
+ ../models/gwint1/hf,2024-06-19 10:56:07,,models/gwint1/hf,,,FAILED,eq-bench,FAILED,1,transformers, ,,Incorrect path_or_model_id: 'models/gwint1/hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
42
+ ../models/gwint2,2024-06-19 10:56:07,,models/gwint2,,,FAILED,eq-bench,FAILED,1,transformers, ,,models/gwint2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
43
+ ../models/gwint1/hf,2024-06-19 11:04:28,,models/gwint1/hf,,,FAILED,eq-bench,FAILED,1,transformers, ,,Incorrect path_or_model_id: 'models/gwint1/hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
44
+ ../models/gwint2,2024-06-19 11:04:29,,models/gwint2,,,FAILED,eq-bench,FAILED,1,transformers, ,,models/gwint2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
45
+ models/gwint1/hf,2024-06-19 11:15:13,,../models/gwint1/hf,,,37.88,eq-bench_v2_pl,169.0,1,transformers, ,,
46
+ models/gwint2,2024-06-19 11:21:15,,../models/gwint2,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
47
+ Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
48
+ Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
49
+ Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
50
+ allegro/plt5-large,2024-06-19 11:51:22,,allegro/plt5-large,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.
51
+ APT3-1B-Instruct-e1,2024-06-19 11:51:22,,APT3-1B-Instruct-e1,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
52
+ APT3-1B-Instruct-e2,2024-06-19 11:51:23,,APT3-1B-Instruct-e2,,,FAILED,eq-bench,FAILED,1,transformers, ,,APT3-1B-Instruct-e2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
53
+ Azurro/APT3-1B-Base,2024-06-19 12:00:40,,Azurro/APT3-1B-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
54
+ OPI-PG/Qra-1b,2024-06-19 12:13:15,,OPI-PG/Qra-1b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
55
+ TinyLlama/TinyLlama-1.1B-Chat-v1.0,2024-06-19 12:23:45,,TinyLlama/TinyLlama-1.1B-Chat-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,36.0 questions were parseable (min is 83%)
56
+ Qwen/Qwen2-1.5B,2024-06-19 12:35:37,,Qwen/Qwen2-1.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,54.0 questions were parseable (min is 83%)
57
+ Qwen/Qwen2-1.5B-Instruct,2024-06-19 12:38:29,,Qwen/Qwen2-1.5B-Instruct,,,15.33,eq-bench_v2_pl,165.0,1,transformers, ,,
58
+ sdadas/polish-gpt2-xl,2024-06-19 12:54:39,,sdadas/polish-gpt2-xl,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
59
+ internlm/internlm2-1_8b,2024-06-19 13:08:50,,internlm/internlm2-1_8b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
60
+ internlm/internlm2-chat-1_8b,2024-06-19 13:13:21,,internlm/internlm2-chat-1_8b,,,13.83,eq-bench_v2_pl,150.0,1,transformers, ,,
61
+ google/gemma-1.1-2b-it,2024-06-19 13:15:24,,google/gemma-1.1-2b-it,,,16.47,eq-bench_v2_pl,171.0,1,transformers, ,,
62
+ microsoft/phi-2,2024-06-19 13:28:07,,microsoft/phi-2,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
63
+ google/mt5-xl,2024-06-19 13:28:10,,google/mt5-xl,,,FAILED,eq-bench,FAILED,1,transformers, ,,Unrecognized configuration class <class 'transformers.models.mt5.configuration_mt5.MT5Config'> for this kind of AutoModel: AutoModelForCausalLM. Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, InternLM2Config, InternLM2Config.
64
+ microsoft/Phi-3-mini-4k-instruct,2024-06-19 13:34:56,,microsoft/Phi-3-mini-4k-instruct,,,28.05,eq-bench_v2_pl,159.0,1,transformers, ,,
65
+ ssmits/Falcon2-5.5B-Polish,2024-06-19 13:47:21,,ssmits/Falcon2-5.5B-Polish,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
66
+ 01-ai/Yi-1.5-6B,2024-06-19 14:04:20,,01-ai/Yi-1.5-6B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
67
+ 01-ai/Yi-1.5-6B-Chat,2024-06-19 14:11:22,,01-ai/Yi-1.5-6B-Chat,,,5.19,eq-bench_v2_pl,161.0,1,transformers, ,,
68
+ THUDM/chatglm3-6b,2024-06-19 14:12:11,,THUDM/chatglm3-6b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
69
+ THUDM/chatglm3-6b-base,2024-06-19 14:13:00,,THUDM/chatglm3-6b-base,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
70
+ alpindale/Mistral-7B-v0.2-hf,2024-06-19 14:16:37,,alpindale/Mistral-7B-v0.2-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,45.0 questions were parseable (min is 83%)
71
+ berkeley-nest/Starling-LM-7B-alpha,2024-06-19 14:22:32,,berkeley-nest/Starling-LM-7B-alpha,,,46.26,eq-bench_v2_pl,171.0,1,transformers, ,,
72
+ google/gemma-7b,2024-06-19 14:38:02,,google/gemma-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
73
+ google/gemma-7b-it,2024-06-19 14:53:28,,google/gemma-7b-it,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
74
+ HuggingFaceH4/zephyr-7b-alpha,2024-06-19 15:05:31,,HuggingFaceH4/zephyr-7b-alpha,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,99.0 questions were parseable (min is 83%)
75
+ HuggingFaceH4/zephyr-7b-beta,2024-06-19 15:18:24,,HuggingFaceH4/zephyr-7b-beta,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,88.0 questions were parseable (min is 83%)
76
+ internlm/internlm2-7b,2024-06-19 15:36:06,,internlm/internlm2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,43.0 questions were parseable (min is 83%)
77
+ internlm/internlm2-base-7b,2024-06-19 15:54:53,,internlm/internlm2-base-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,6.0 questions were parseable (min is 83%)
78
+ internlm/internlm2-chat-7b,2024-06-19 16:02:07,,internlm/internlm2-chat-7b,,,40.0,eq-bench_v2_pl,169.0,1,transformers, ,,
79
+ internlm/internlm2-chat-7b-sft,2024-06-19 16:07:04,,internlm/internlm2-chat-7b-sft,,,41.62,eq-bench_v2_pl,170.0,1,transformers, ,,
80
+ lex-hue/Delexa-7b,2024-06-19 16:12:19,,lex-hue/Delexa-7b,,,49.03,eq-bench_v2_pl,169.0,1,transformers, ,,
81
+ meta-llama/Llama-2-7b-chat-hf,2024-06-19 16:21:08,,meta-llama/Llama-2-7b-chat-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,116.0 questions were parseable (min is 83%)
82
+ meta-llama/Llama-2-7b-hf,2024-06-19 16:36:41,,meta-llama/Llama-2-7b-hf,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
83
+ microsoft/WizardLM-2-7B,2024-06-19 16:44:22,,microsoft/WizardLM-2-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,137.0 questions were parseable (min is 83%)
84
+ mistralai/Mistral-7B-Instruct-v0.1,2024-06-19 16:44:33,,mistralai/Mistral-7B-Instruct-v0.1,,,FAILED,eq-bench,FAILED,1,transformers, ,,Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
85
+ mistralai/Mistral-7B-Instruct-v0.2,2024-06-19 16:50:36,,mistralai/Mistral-7B-Instruct-v0.2,,,53.25,eq-bench_v2_pl,151.0,1,transformers, ,,
86
+ mistralai/Mistral-7B-Instruct-v0.3,2024-06-19 16:54:49,,mistralai/Mistral-7B-Instruct-v0.3,,,45.21,eq-bench_v2_pl,171.0,1,transformers, ,,
87
+ mistralai/Mistral-7B-v0.1,2024-06-19 16:59:50,,mistralai/Mistral-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,65.0 questions were parseable (min is 83%)
88
+ mistralai/Mistral-7B-v0.3,2024-06-19 17:16:38,,mistralai/Mistral-7B-v0.3,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,14.0 questions were parseable (min is 83%)
89
+ Nexusflow/Starling-LM-7B-beta,2024-06-19 17:23:18,,Nexusflow/Starling-LM-7B-beta,,,45.1,eq-bench_v2_pl,166.0,1,transformers, ,,
90
+ openchat/openchat-3.5-0106,2024-06-19 17:27:10,,openchat/openchat-3.5-0106,,,43.81,eq-bench_v2_pl,171.0,1,transformers, ,,
91
+ openchat/openchat-3.5-0106-gemma,2024-06-19 17:30:31,,openchat/openchat-3.5-0106-gemma,,,58.62,eq-bench_v2_pl,169.0,1,transformers, ,,
92
+ openchat/openchat-3.5-1210,2024-06-19 17:34:27,,openchat/openchat-3.5-1210,,,49.04,eq-bench_v2_pl,171.0,1,transformers, ,,
93
+ OPI-PG/Qra-7b,2024-06-19 17:50:28,,OPI-PG/Qra-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
94
+ Qwen/Qwen1.5-7B,2024-06-19 17:57:53,,Qwen/Qwen1.5-7B,,,23.11,eq-bench_v2_pl,155.0,1,transformers, ,,
95
+ Qwen/Qwen1.5-7B-Chat,2024-06-19 18:03:34,,Qwen/Qwen1.5-7B-Chat,,,25.0,eq-bench_v2_pl,164.0,1,transformers, ,,
96
+ Qwen/Qwen2-7B,2024-06-19 18:09:23,,Qwen/Qwen2-7B,,,36.58,eq-bench_v2_pl,166.0,1,transformers, ,,
97
+ Qwen/Qwen2-7B-Instruct,2024-06-19 18:12:42,,Qwen/Qwen2-7B-Instruct,,,53.74,eq-bench_v2_pl,171.0,1,transformers, ,,
98
+ Remek/Kruk-7B-SP-001,2024-06-19 18:17:13,,Remek/Kruk-7B-SP-001,,,44.44,eq-bench_v2_pl,171.0,1,transformers, ,,
99
+ Remek/OpenChat-3.5-0106-PL-Omnibusv2,2024-06-19 18:17:24,,Remek/OpenChat-3.5-0106-PL-Omnibusv2,,,FAILED,eq-bench,FAILED,1,transformers, ,,'system_message' is undefined
100
+ Remek/OpenChat3.5-0106-Spichlerz-Bocian,2024-06-19 18:24:08,,Remek/OpenChat3.5-0106-Spichlerz-Bocian,,,44.13,eq-bench_v2_pl,166.0,1,transformers, ,,
101
+ Remek/OpenChat3.5-0106-Spichlerz-Inst-001,2024-06-19 18:28:48,,Remek/OpenChat3.5-0106-Spichlerz-Inst-001,,,41.6,eq-bench_v2_pl,171.0,1,transformers, ,,
102
+ RWKV/HF_v5-Eagle-7B,2024-06-19 19:16:27,,RWKV/HF_v5-Eagle-7B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
103
+ RWKV/v5-Eagle-7B-HF,2024-06-19 20:04:12,,RWKV/v5-Eagle-7B-HF,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
104
+ speakleash/Bielik-7B-v0.1,2024-06-19 20:11:16,,speakleash/Bielik-7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,139.0 questions were parseable (min is 83%)
105
+ szymonrucinski/Curie-7B-v1,2024-06-19 20:29:24,,szymonrucinski/Curie-7B-v1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
106
+ teknium/OpenHermes-2.5-Mistral-7B,2024-06-19 20:34:12,,teknium/OpenHermes-2.5-Mistral-7B,,,37.48,eq-bench_v2_pl,171.0,1,transformers, ,,
107
+ Voicelab/trurl-2-7b,2024-06-19 20:39:26,,Voicelab/trurl-2-7b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,141.0 questions were parseable (min is 83%)
108
+ microsoft/Phi-3-small-8k-instruct,2024-06-19 20:39:31,,microsoft/Phi-3-small-8k-instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,No module named 'pytest'
109
+ CohereForAI/aya-23-8B,2024-06-19 20:44:01,,CohereForAI/aya-23-8B,,,45.43,eq-bench_v2_pl,171.0,1,transformers, ,,
110
+ meta-llama/Meta-Llama-3-8B,2024-06-19 21:01:55,,meta-llama/Meta-Llama-3-8B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
111
+ meta-llama/Meta-Llama-3-8B-Instruct,2024-06-19 21:06:08,,meta-llama/Meta-Llama-3-8B-Instruct,,,46.27,eq-bench_v2_pl,171.0,1,transformers, ,,
112
+ mlabonne/NeuralDaredevil-8B-abliterated,2024-06-19 21:13:31,,mlabonne/NeuralDaredevil-8B-abliterated,,,54.74,eq-bench_v2_pl,171.0,1,transformers, ,,
113
+ NousResearch/Hermes-2-Pro-Llama-3-8B,2024-06-19 21:18:18,,NousResearch/Hermes-2-Pro-Llama-3-8B,,,54.57,eq-bench_v2_pl,171.0,1,transformers, ,,
114
+ NousResearch/Hermes-2-Theta-Llama-3-8B,2024-06-19 21:25:22,,NousResearch/Hermes-2-Theta-Llama-3-8B,,,54.88,eq-bench_v2_pl,171.0,1,transformers, ,,
115
+ nvidia/Llama3-ChatQA-1.5-8B,2024-06-19 22:27:24,,nvidia/Llama3-ChatQA-1.5-8B,,,40.55,eq-bench_v2_pl,166.0,1,transformers, ,,
116
+ openchat/openchat-3.6-8b-20240522,2024-06-19 22:34:56,,openchat/openchat-3.6-8b-20240522,,,-2.0090659464796595e+18,eq-bench_v2_pl,170.0,1,transformers, ,,
117
+ Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,2024-06-19 22:39:46,,Remek/Llama-3-8B-Omnibus-1-PL-v01-INSTRUCT,,,26.63,eq-bench_v2_pl,171.0,1,transformers, ,,
118
+ 01-ai/Yi-1.5-9B,2024-06-19 23:07:56,,01-ai/Yi-1.5-9B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
119
+ 01-ai/Yi-1.5-9B-Chat,2024-06-19 23:19:16,,01-ai/Yi-1.5-9B-Chat,,,48.78,eq-bench_v2_pl,163.0,1,transformers, ,,
120
+ google/recurrentgemma-9b-it,2024-06-19 23:28:19,,google/recurrentgemma-9b-it,,,52.82,eq-bench_v2_pl,171.0,1,transformers, ,,
121
+ THUDM/glm-4-9b,2024-06-19 23:28:41,,THUDM/glm-4-9b,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
122
+ THUDM/glm-4-9b-chat,2024-06-19 23:29:01,,THUDM/glm-4-9b-chat,,,FAILED,eq-bench,FAILED,1,transformers, ,,too many values to unpack (expected 2)
123
+ NousResearch/Nous-Hermes-2-SOLAR-10.7B,2024-06-19 23:51:07,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,49.85,eq-bench_v2_pl,169.0,1,transformers, ,,
124
+ TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,2024-06-20 00:00:02,,TeeZee/Bielik-SOLAR-LIKE-10.7B-Instruct-v0.1,,,35.63,eq-bench_v2_pl,164.0,1,transformers, ,,
125
+ upstage/SOLAR-10.7B-Instruct-v1.0,2024-06-20 00:19:48,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.35,eq-bench_v2_pl,162.0,1,transformers, ,,
126
+ upstage/SOLAR-10.7B-v1.0,2024-06-20 01:12:51,,upstage/SOLAR-10.7B-v1.0,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,1.0 questions were parseable (min is 83%)
127
+ tiiuae/falcon-11B,2024-06-20 01:23:54,,tiiuae/falcon-11B,,,42.41,eq-bench_v2_pl,171.0,1,transformers, ,,
128
+ lmsys/vicuna-13b-v1.5,2024-06-20 01:43:40,,lmsys/vicuna-13b-v1.5,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,84.0 questions were parseable (min is 83%)
129
+ OPI-PG/Qra-13b,2024-06-20 02:07:48,,OPI-PG/Qra-13b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
130
+ teknium/OpenHermes-13B,2024-06-20 02:32:04,,teknium/OpenHermes-13B,,,36.85,eq-bench_v2_pl,162.0,1,transformers, ,,
131
+ Voicelab/trurl-2-13b-academic,2024-06-20 02:38:04,,Voicelab/trurl-2-13b-academic,,,25.92,eq-bench_v2_pl,162.0,1,transformers, ,,
132
+ microsoft/Phi-3-medium-4k-instruct,2024-06-20 02:46:38,,microsoft/Phi-3-medium-4k-instruct,,,57.07,eq-bench_v2_pl,169.0,1,transformers, ,,
133
+ Qwen/Qwen1.5-14B-Chat,2024-06-20 02:52:13,,Qwen/Qwen1.5-14B-Chat,,,51.26,eq-bench_v2_pl,160.0,1,transformers, ,,
134
+ internlm/internlm2-20b,2024-06-20 09:04:33,,internlm/internlm2-20b,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,4.0 questions were parseable (min is 83%)
135
+ internlm/internlm2-chat-20b,2024-06-20 09:47:11,,internlm/internlm2-chat-20b,,,36.52,eq-bench_v2_pl,170.0,1,transformers, ,,
136
+ Qwen/Qwen1.5-32B,2024-06-20 13:25:12,,Qwen/Qwen1.5-32B,,,54.35,eq-bench_v2_pl,170.0,1,transformers, ,,
137
+ Qwen/Qwen1.5-32B-Chat,2024-06-20 13:34:52,,Qwen/Qwen1.5-32B-Chat,,,60.69,eq-bench_v2_pl,168.0,1,transformers, ,,
138
+ 01-ai/Yi-1.5-34B-Chat,2024-06-20 13:51:30,,01-ai/Yi-1.5-34B-Chat,,,46.32,eq-bench_v2_pl,171.0,1,transformers, ,,
139
+ CohereForAI/aya-23-35B,2024-06-20 14:03:07,,CohereForAI/aya-23-35B,,,58.41,eq-bench_v2_pl,171.0,1,transformers, ,,
140
+ CohereForAI/c4ai-command-r-v01,2024-06-20 14:14:54,,CohereForAI/c4ai-command-r-v01,,,56.43,eq-bench_v2_pl,171.0,1,transformers, ,,
141
+ mistralai/Mixtral-8x7B-Instruct-v0.1,2024-06-20 14:35:28,,mistralai/Mixtral-8x7B-Instruct-v0.1,,,58.64,eq-bench_v2_pl,168.0,1,transformers, ,,
142
+ mistralai/Mixtral-8x7B-v0.1,2024-06-20 15:30:24,,mistralai/Mixtral-8x7B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,10.0 questions were parseable (min is 83%)
143
+ Qwen/Qwen2-57B-A14B-Instruct,2024-06-20 16:19:41,,Qwen/Qwen2-57B-A14B-Instruct,,,57.64,eq-bench_v2_pl,171.0,1,transformers, ,,
144
+ meta-llama/Meta-Llama-3-70B,2024-06-20 16:59:30,,meta-llama/Meta-Llama-3-70B,,,46.1,eq-bench_v2_pl,145.0,1,transformers, ,,
145
+ meta-llama/Meta-Llama-3-70B-Instruct,2024-06-20 17:15:58,,meta-llama/Meta-Llama-3-70B-Instruct,,,71.21,eq-bench_v2_pl,171.0,1,transformers, ,,
146
+ Qwen/Qwen1.5-72B,2024-06-20 17:50:17,,Qwen/Qwen1.5-72B,,,53.96,eq-bench_v2_pl,163.0,1,transformers, ,,
147
+ Qwen/Qwen1.5-72B-Chat,2024-06-20 18:06:58,,Qwen/Qwen1.5-72B-Chat,,,68.03,eq-bench_v2_pl,171.0,1,transformers, ,,
148
+ Qwen/Qwen2-72B,2024-06-20 18:36:22,,Qwen/Qwen2-72B,,,69.75,eq-bench_v2_pl,169.0,1,transformers, ,,
149
+ Qwen/Qwen2-72B-Instruct,2024-06-20 18:55:02,,Qwen/Qwen2-72B-Instruct,,,72.07,eq-bench_v2_pl,169.0,1,transformers, ,,
requirements.txt CHANGED
@@ -1,18 +1,4 @@
1
- APScheduler
2
- black
3
- click
4
- datasets
5
  gradio
6
  gradio_client
7
- huggingface-hub>=0.18.0
8
- matplotlib
9
- numpy
10
  pandas
11
- python-dateutil
12
- requests
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
- accelerate
18
- sentencepiece
 
1
+ tqdm
 
 
 
2
  gradio
3
  gradio_client
 
 
 
4
  pandas
 
 
 
 
 
 
 
 
src/about.py CHANGED
@@ -21,11 +21,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">Polish EQ-Bench Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ Polish Emotional Intelligence Benchmark for LLMs
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?