|
--- |
|
license: apache-2.0 |
|
library_name: transformers |
|
tags: |
|
- merge |
|
pipeline_tag: text-generation |
|
model-index: |
|
- name: TheTop-5x7B-Instruct-D-v0.1 |
|
results: |
|
- task: |
|
type: text-generation |
|
name: Text Generation |
|
dataset: |
|
name: AI2 Reasoning Challenge (25-Shot) |
|
type: ai2_arc |
|
config: ARC-Challenge |
|
split: test |
|
args: |
|
num_few_shot: 25 |
|
metrics: |
|
- type: acc_norm |
|
value: 71.76 |
|
name: normalized accuracy |
|
source: |
|
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1 |
|
name: Open LLM Leaderboard |
|
- task: |
|
type: text-generation |
|
name: Text Generation |
|
dataset: |
|
name: HellaSwag (10-Shot) |
|
type: hellaswag |
|
split: validation |
|
args: |
|
num_few_shot: 10 |
|
metrics: |
|
- type: acc_norm |
|
value: 88.21 |
|
name: normalized accuracy |
|
source: |
|
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1 |
|
name: Open LLM Leaderboard |
|
- task: |
|
type: text-generation |
|
name: Text Generation |
|
dataset: |
|
name: MMLU (5-Shot) |
|
type: cais/mmlu |
|
config: all |
|
split: test |
|
args: |
|
num_few_shot: 5 |
|
metrics: |
|
- type: acc |
|
value: 64.86 |
|
name: accuracy |
|
source: |
|
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1 |
|
name: Open LLM Leaderboard |
|
- task: |
|
type: text-generation |
|
name: Text Generation |
|
dataset: |
|
name: TruthfulQA (0-shot) |
|
type: truthful_qa |
|
config: multiple_choice |
|
split: validation |
|
args: |
|
num_few_shot: 0 |
|
metrics: |
|
- type: mc2 |
|
value: 66.32 |
|
source: |
|
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1 |
|
name: Open LLM Leaderboard |
|
- task: |
|
type: text-generation |
|
name: Text Generation |
|
dataset: |
|
name: Winogrande (5-shot) |
|
type: winogrande |
|
config: winogrande_xl |
|
split: validation |
|
args: |
|
num_few_shot: 5 |
|
metrics: |
|
- type: acc |
|
value: 84.37 |
|
name: accuracy |
|
source: |
|
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1 |
|
name: Open LLM Leaderboard |
|
- task: |
|
type: text-generation |
|
name: Text Generation |
|
dataset: |
|
name: GSM8k (5-shot) |
|
type: gsm8k |
|
config: main |
|
split: test |
|
args: |
|
num_few_shot: 5 |
|
metrics: |
|
- type: acc |
|
value: 71.72 |
|
name: accuracy |
|
source: |
|
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1 |
|
name: Open LLM Leaderboard |
|
--- |
|
|
|
A merge of top-ranked 7B models, built with the DARE merge method using mergekit.
|
|
|
> mergekit is a toolkit for merging pre-trained language models. mergekit uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention. |
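
A minimal usage sketch with the `transformers` text-generation pipeline; the prompt, dtype/device, and sampling settings below are illustrative assumptions, not recommendations from the model author:

```python
# Minimal sketch: run the merged model through the transformers text-generation pipeline.
# The dtype/device and sampling settings are illustrative assumptions.
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="MaziyarPanahi/TheTop-5x7B-Instruct-D-v0.1",
    torch_dtype=torch.float16,  # assumes a GPU; drop or change for CPU-only setups
    device_map="auto",          # requires `accelerate`
)

result = pipe(
    "Explain what model merging is in two sentences.",
    max_new_tokens=128,
    do_sample=True,
    temperature=0.7,
)
print(result[0]["generated_text"])
```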
|
|
|
## Eval

Raw per-task output from the Open LLM Leaderboard evaluation harness:
|
|
|
```json
|
{ |
|
"all": { |
|
"acc": 0.6547370286177235, |
|
"acc_stderr": 0.03204709242170183, |
|
"acc_norm": 0.6537337854798912, |
|
"acc_norm_stderr": 0.03272317883588649, |
|
"mc1": 0.5189718482252142, |
|
"mc1_stderr": 0.01749089640576236, |
|
"mc2": 0.6631825155680797, |
|
"mc2_stderr": 0.01527641053841743 |
|
}, |
|
"harness|arc:challenge|25": { |
|
"acc": 0.6953924914675768, |
|
"acc_stderr": 0.013449522109932485, |
|
"acc_norm": 0.7175767918088737, |
|
"acc_norm_stderr": 0.013155456884097225 |
|
}, |
|
"harness|hellaswag|10": { |
|
"acc": 0.7120095598486357, |
|
"acc_stderr": 0.004519011688417168, |
|
"acc_norm": 0.8820952001593309, |
|
"acc_norm_stderr": 0.003218362717491129 |
|
}, |
|
"harness|hendrycksTest-abstract_algebra|5": { |
|
"acc": 0.33, |
|
"acc_stderr": 0.047258156262526045, |
|
"acc_norm": 0.33, |
|
"acc_norm_stderr": 0.047258156262526045 |
|
}, |
|
"harness|hendrycksTest-anatomy|5": { |
|
"acc": 0.6296296296296297, |
|
"acc_stderr": 0.041716541613545426, |
|
"acc_norm": 0.6296296296296297, |
|
"acc_norm_stderr": 0.041716541613545426 |
|
}, |
|
"harness|hendrycksTest-astronomy|5": { |
|
"acc": 0.7105263157894737, |
|
"acc_stderr": 0.03690677986137283, |
|
"acc_norm": 0.7105263157894737, |
|
"acc_norm_stderr": 0.03690677986137283 |
|
}, |
|
"harness|hendrycksTest-business_ethics|5": { |
|
"acc": 0.64, |
|
"acc_stderr": 0.04824181513244218, |
|
"acc_norm": 0.64, |
|
"acc_norm_stderr": 0.04824181513244218 |
|
}, |
|
"harness|hendrycksTest-clinical_knowledge|5": { |
|
"acc": 0.7056603773584905, |
|
"acc_stderr": 0.02804918631569525, |
|
"acc_norm": 0.7056603773584905, |
|
"acc_norm_stderr": 0.02804918631569525 |
|
}, |
|
"harness|hendrycksTest-college_biology|5": { |
|
"acc": 0.7638888888888888, |
|
"acc_stderr": 0.03551446610810826, |
|
"acc_norm": 0.7638888888888888, |
|
"acc_norm_stderr": 0.03551446610810826 |
|
}, |
|
"harness|hendrycksTest-college_chemistry|5": { |
|
"acc": 0.48, |
|
"acc_stderr": 0.050211673156867795, |
|
"acc_norm": 0.48, |
|
"acc_norm_stderr": 0.050211673156867795 |
|
}, |
|
"harness|hendrycksTest-college_computer_science|5": { |
|
"acc": 0.54, |
|
"acc_stderr": 0.05009082659620333, |
|
"acc_norm": 0.54, |
|
"acc_norm_stderr": 0.05009082659620333 |
|
}, |
|
"harness|hendrycksTest-college_mathematics|5": { |
|
"acc": 0.31, |
|
"acc_stderr": 0.04648231987117316, |
|
"acc_norm": 0.31, |
|
"acc_norm_stderr": 0.04648231987117316 |
|
}, |
|
"harness|hendrycksTest-college_medicine|5": { |
|
"acc": 0.6705202312138728, |
|
"acc_stderr": 0.03583901754736411, |
|
"acc_norm": 0.6705202312138728, |
|
"acc_norm_stderr": 0.03583901754736411 |
|
}, |
|
"harness|hendrycksTest-college_physics|5": { |
|
"acc": 0.4215686274509804, |
|
"acc_stderr": 0.04913595201274498, |
|
"acc_norm": 0.4215686274509804, |
|
"acc_norm_stderr": 0.04913595201274498 |
|
}, |
|
"harness|hendrycksTest-computer_security|5": { |
|
"acc": 0.78, |
|
"acc_stderr": 0.04163331998932263, |
|
"acc_norm": 0.78, |
|
"acc_norm_stderr": 0.04163331998932263 |
|
}, |
|
"harness|hendrycksTest-conceptual_physics|5": { |
|
"acc": 0.5787234042553191, |
|
"acc_stderr": 0.03227834510146268, |
|
"acc_norm": 0.5787234042553191, |
|
"acc_norm_stderr": 0.03227834510146268 |
|
}, |
|
"harness|hendrycksTest-econometrics|5": { |
|
"acc": 0.5, |
|
"acc_stderr": 0.047036043419179864, |
|
"acc_norm": 0.5, |
|
"acc_norm_stderr": 0.047036043419179864 |
|
}, |
|
"harness|hendrycksTest-electrical_engineering|5": { |
|
"acc": 0.5586206896551724, |
|
"acc_stderr": 0.04137931034482758, |
|
"acc_norm": 0.5586206896551724, |
|
"acc_norm_stderr": 0.04137931034482758 |
|
}, |
|
"harness|hendrycksTest-elementary_mathematics|5": { |
|
"acc": 0.42857142857142855, |
|
"acc_stderr": 0.02548718714785938, |
|
"acc_norm": 0.42857142857142855, |
|
"acc_norm_stderr": 0.02548718714785938 |
|
}, |
|
"harness|hendrycksTest-formal_logic|5": { |
|
"acc": 0.47619047619047616, |
|
"acc_stderr": 0.04467062628403273, |
|
"acc_norm": 0.47619047619047616, |
|
"acc_norm_stderr": 0.04467062628403273 |
|
}, |
|
"harness|hendrycksTest-global_facts|5": { |
|
"acc": 0.33, |
|
"acc_stderr": 0.04725815626252604, |
|
"acc_norm": 0.33, |
|
"acc_norm_stderr": 0.04725815626252604 |
|
}, |
|
"harness|hendrycksTest-high_school_biology|5": { |
|
"acc": 0.7903225806451613, |
|
"acc_stderr": 0.023157879349083525, |
|
"acc_norm": 0.7903225806451613, |
|
"acc_norm_stderr": 0.023157879349083525 |
|
}, |
|
"harness|hendrycksTest-high_school_chemistry|5": { |
|
"acc": 0.4876847290640394, |
|
"acc_stderr": 0.035169204442208966, |
|
"acc_norm": 0.4876847290640394, |
|
"acc_norm_stderr": 0.035169204442208966 |
|
}, |
|
"harness|hendrycksTest-high_school_computer_science|5": { |
|
"acc": 0.68, |
|
"acc_stderr": 0.04688261722621505, |
|
"acc_norm": 0.68, |
|
"acc_norm_stderr": 0.04688261722621505 |
|
}, |
|
"harness|hendrycksTest-high_school_european_history|5": { |
|
"acc": 0.7878787878787878, |
|
"acc_stderr": 0.03192271569548301, |
|
"acc_norm": 0.7878787878787878, |
|
"acc_norm_stderr": 0.03192271569548301 |
|
}, |
|
"harness|hendrycksTest-high_school_geography|5": { |
|
"acc": 0.797979797979798, |
|
"acc_stderr": 0.02860620428922987, |
|
"acc_norm": 0.797979797979798, |
|
"acc_norm_stderr": 0.02860620428922987 |
|
}, |
|
"harness|hendrycksTest-high_school_government_and_politics|5": { |
|
"acc": 0.9015544041450777, |
|
"acc_stderr": 0.021500249576033456, |
|
"acc_norm": 0.9015544041450777, |
|
"acc_norm_stderr": 0.021500249576033456 |
|
}, |
|
"harness|hendrycksTest-high_school_macroeconomics|5": { |
|
"acc": 0.6666666666666666, |
|
"acc_stderr": 0.023901157979402538, |
|
"acc_norm": 0.6666666666666666, |
|
"acc_norm_stderr": 0.023901157979402538 |
|
}, |
|
"harness|hendrycksTest-high_school_mathematics|5": { |
|
"acc": 0.35185185185185186, |
|
"acc_stderr": 0.029116617606083008, |
|
"acc_norm": 0.35185185185185186, |
|
"acc_norm_stderr": 0.029116617606083008 |
|
}, |
|
"harness|hendrycksTest-high_school_microeconomics|5": { |
|
"acc": 0.6722689075630253, |
|
"acc_stderr": 0.03048991141767323, |
|
"acc_norm": 0.6722689075630253, |
|
"acc_norm_stderr": 0.03048991141767323 |
|
}, |
|
"harness|hendrycksTest-high_school_physics|5": { |
|
"acc": 0.36423841059602646, |
|
"acc_stderr": 0.03929111781242742, |
|
"acc_norm": 0.36423841059602646, |
|
"acc_norm_stderr": 0.03929111781242742 |
|
}, |
|
"harness|hendrycksTest-high_school_psychology|5": { |
|
"acc": 0.8440366972477065, |
|
"acc_stderr": 0.015555802713590167, |
|
"acc_norm": 0.8440366972477065, |
|
"acc_norm_stderr": 0.015555802713590167 |
|
}, |
|
"harness|hendrycksTest-high_school_statistics|5": { |
|
"acc": 0.5092592592592593, |
|
"acc_stderr": 0.034093869469927006, |
|
"acc_norm": 0.5092592592592593, |
|
"acc_norm_stderr": 0.034093869469927006 |
|
}, |
|
"harness|hendrycksTest-high_school_us_history|5": { |
|
"acc": 0.8333333333333334, |
|
"acc_stderr": 0.026156867523931045, |
|
"acc_norm": 0.8333333333333334, |
|
"acc_norm_stderr": 0.026156867523931045 |
|
}, |
|
"harness|hendrycksTest-high_school_world_history|5": { |
|
"acc": 0.7848101265822784, |
|
"acc_stderr": 0.02675082699467618, |
|
"acc_norm": 0.7848101265822784, |
|
"acc_norm_stderr": 0.02675082699467618 |
|
}, |
|
"harness|hendrycksTest-human_aging|5": { |
|
"acc": 0.6905829596412556, |
|
"acc_stderr": 0.03102441174057221, |
|
"acc_norm": 0.6905829596412556, |
|
"acc_norm_stderr": 0.03102441174057221 |
|
}, |
|
"harness|hendrycksTest-human_sexuality|5": { |
|
"acc": 0.7786259541984732, |
|
"acc_stderr": 0.03641297081313729, |
|
"acc_norm": 0.7786259541984732, |
|
"acc_norm_stderr": 0.03641297081313729 |
|
}, |
|
"harness|hendrycksTest-international_law|5": { |
|
"acc": 0.7933884297520661, |
|
"acc_stderr": 0.03695980128098824, |
|
"acc_norm": 0.7933884297520661, |
|
"acc_norm_stderr": 0.03695980128098824 |
|
}, |
|
"harness|hendrycksTest-jurisprudence|5": { |
|
"acc": 0.7870370370370371, |
|
"acc_stderr": 0.0395783547198098, |
|
"acc_norm": 0.7870370370370371, |
|
"acc_norm_stderr": 0.0395783547198098 |
|
}, |
|
"harness|hendrycksTest-logical_fallacies|5": { |
|
"acc": 0.7730061349693251, |
|
"acc_stderr": 0.03291099578615769, |
|
"acc_norm": 0.7730061349693251, |
|
"acc_norm_stderr": 0.03291099578615769 |
|
}, |
|
"harness|hendrycksTest-machine_learning|5": { |
|
"acc": 0.45535714285714285, |
|
"acc_stderr": 0.047268355537191, |
|
"acc_norm": 0.45535714285714285, |
|
"acc_norm_stderr": 0.047268355537191 |
|
}, |
|
"harness|hendrycksTest-management|5": { |
|
"acc": 0.7766990291262136, |
|
"acc_stderr": 0.04123553189891431, |
|
"acc_norm": 0.7766990291262136, |
|
"acc_norm_stderr": 0.04123553189891431 |
|
}, |
|
"harness|hendrycksTest-marketing|5": { |
|
"acc": 0.8760683760683761, |
|
"acc_stderr": 0.021586494001281376, |
|
"acc_norm": 0.8760683760683761, |
|
"acc_norm_stderr": 0.021586494001281376 |
|
}, |
|
"harness|hendrycksTest-medical_genetics|5": { |
|
"acc": 0.72, |
|
"acc_stderr": 0.045126085985421276, |
|
"acc_norm": 0.72, |
|
"acc_norm_stderr": 0.045126085985421276 |
|
}, |
|
"harness|hendrycksTest-miscellaneous|5": { |
|
"acc": 0.8275862068965517, |
|
"acc_stderr": 0.013507943909371798, |
|
"acc_norm": 0.8275862068965517, |
|
"acc_norm_stderr": 0.013507943909371798 |
|
}, |
|
"harness|hendrycksTest-moral_disputes|5": { |
|
"acc": 0.7427745664739884, |
|
"acc_stderr": 0.02353292543104429, |
|
"acc_norm": 0.7427745664739884, |
|
"acc_norm_stderr": 0.02353292543104429 |
|
}, |
|
"harness|hendrycksTest-moral_scenarios|5": { |
|
"acc": 0.4312849162011173, |
|
"acc_stderr": 0.016563829399047707, |
|
"acc_norm": 0.4312849162011173, |
|
"acc_norm_stderr": 0.016563829399047707 |
|
}, |
|
"harness|hendrycksTest-nutrition|5": { |
|
"acc": 0.7320261437908496, |
|
"acc_stderr": 0.025360603796242557, |
|
"acc_norm": 0.7320261437908496, |
|
"acc_norm_stderr": 0.025360603796242557 |
|
}, |
|
"harness|hendrycksTest-philosophy|5": { |
|
"acc": 0.7170418006430869, |
|
"acc_stderr": 0.02558306248998481, |
|
"acc_norm": 0.7170418006430869, |
|
"acc_norm_stderr": 0.02558306248998481 |
|
}, |
|
"harness|hendrycksTest-prehistory|5": { |
|
"acc": 0.7438271604938271, |
|
"acc_stderr": 0.024288533637726095, |
|
"acc_norm": 0.7438271604938271, |
|
"acc_norm_stderr": 0.024288533637726095 |
|
}, |
|
"harness|hendrycksTest-professional_accounting|5": { |
|
"acc": 0.46808510638297873, |
|
"acc_stderr": 0.029766675075873866, |
|
"acc_norm": 0.46808510638297873, |
|
"acc_norm_stderr": 0.029766675075873866 |
|
}, |
|
"harness|hendrycksTest-professional_law|5": { |
|
"acc": 0.4726205997392438, |
|
"acc_stderr": 0.012751075788015055, |
|
"acc_norm": 0.4726205997392438, |
|
"acc_norm_stderr": 0.012751075788015055 |
|
}, |
|
"harness|hendrycksTest-professional_medicine|5": { |
|
"acc": 0.6801470588235294, |
|
"acc_stderr": 0.02833295951403121, |
|
"acc_norm": 0.6801470588235294, |
|
"acc_norm_stderr": 0.02833295951403121 |
|
}, |
|
"harness|hendrycksTest-professional_psychology|5": { |
|
"acc": 0.6748366013071896, |
|
"acc_stderr": 0.018950886770806315, |
|
"acc_norm": 0.6748366013071896, |
|
"acc_norm_stderr": 0.018950886770806315 |
|
}, |
|
"harness|hendrycksTest-public_relations|5": { |
|
"acc": 0.6909090909090909, |
|
"acc_stderr": 0.044262946482000985, |
|
"acc_norm": 0.6909090909090909, |
|
"acc_norm_stderr": 0.044262946482000985 |
|
}, |
|
"harness|hendrycksTest-security_studies|5": { |
|
"acc": 0.7306122448979592, |
|
"acc_stderr": 0.02840125202902294, |
|
"acc_norm": 0.7306122448979592, |
|
"acc_norm_stderr": 0.02840125202902294 |
|
}, |
|
"harness|hendrycksTest-sociology|5": { |
|
"acc": 0.835820895522388, |
|
"acc_stderr": 0.026193923544454115, |
|
"acc_norm": 0.835820895522388, |
|
"acc_norm_stderr": 0.026193923544454115 |
|
}, |
|
"harness|hendrycksTest-us_foreign_policy|5": { |
|
"acc": 0.85, |
|
"acc_stderr": 0.03588702812826371, |
|
"acc_norm": 0.85, |
|
"acc_norm_stderr": 0.03588702812826371 |
|
}, |
|
"harness|hendrycksTest-virology|5": { |
|
"acc": 0.5602409638554217, |
|
"acc_stderr": 0.03864139923699122, |
|
"acc_norm": 0.5602409638554217, |
|
"acc_norm_stderr": 0.03864139923699122 |
|
}, |
|
"harness|hendrycksTest-world_religions|5": { |
|
"acc": 0.8362573099415205, |
|
"acc_stderr": 0.028380919596145866, |
|
"acc_norm": 0.8362573099415205, |
|
"acc_norm_stderr": 0.028380919596145866 |
|
}, |
|
"harness|truthfulqa:mc|0": { |
|
"mc1": 0.5189718482252142, |
|
"mc1_stderr": 0.01749089640576236, |
|
"mc2": 0.6631825155680797, |
|
"mc2_stderr": 0.01527641053841743 |
|
}, |
|
"harness|winogrande|5": { |
|
"acc": 0.8437253354380426, |
|
"acc_stderr": 0.01020535179187352 |
|
}, |
|
"harness|gsm8k|5": { |
|
"acc": 0.7172100075815011, |
|
"acc_stderr": 0.012405020417873619 |
|
} |
|
} |
|
|
|
``` |
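
A small sketch of how the headline numbers can be pulled out of the raw harness output above, assuming it has been saved to `results.json` (the file name and the simple unweighted mean over the `hendrycksTest-*` subtasks are assumptions, not the official scoring script):

```python
# Sketch: extract headline metrics from the raw harness JSON above (saved as results.json).
import json
from statistics import mean

with open("results.json") as f:
    results = json.load(f)

arc   = results["harness|arc:challenge|25"]["acc_norm"]
hella = results["harness|hellaswag|10"]["acc_norm"]
mmlu  = mean(v["acc"] for k, v in results.items() if k.startswith("harness|hendrycksTest-"))
tqa   = results["harness|truthfulqa:mc|0"]["mc2"]
wino  = results["harness|winogrande|5"]["acc"]
gsm8k = results["harness|gsm8k|5"]["acc"]

for name, score in [("ARC", arc), ("HellaSwag", hella), ("MMLU", mmlu),
                    ("TruthfulQA", tqa), ("Winogrande", wino), ("GSM8k", gsm8k)]:
    print(f"{name:<10} {100 * score:.2f}")
```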
|
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) |
|
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_MaziyarPanahi__TheTop-5x7B-Instruct-D-v0.1) |
|
|
|
| Metric |Value| |
|
|---------------------------------|----:| |
|
|Avg. |74.54| |
|
|AI2 Reasoning Challenge (25-Shot)|71.76| |
|
|HellaSwag (10-Shot) |88.21| |
|
|MMLU (5-Shot) |64.86| |
|
|TruthfulQA (0-shot) |66.32| |
|
|Winogrande (5-shot) |84.37| |
|
|GSM8k (5-shot) |71.72| |
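
The reported average is the plain unweighted mean of the six benchmark scores:

```python
# Unweighted mean of the six leaderboard benchmark scores from the table above.
scores = {
    "ARC (25-shot)":       71.76,
    "HellaSwag (10-shot)": 88.21,
    "MMLU (5-shot)":       64.86,
    "TruthfulQA (0-shot)": 66.32,
    "Winogrande (5-shot)": 84.37,
    "GSM8k (5-shot)":      71.72,
}
print(round(sum(scores.values()) / len(scores), 2))  # 74.54
```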
|
|
|
|