---
base_model:
- RekaAI/reka-flash-3
---
vllm (pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.720 | ± | 0.0285 |
| | | strict-match | 5 | exact_match | ↑ | 0.676 | ± | 0.0297 |
vllm (pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.724 | ± | 0.0200 |
| | | strict-match | 5 | exact_match | ↑ | 0.684 | ± | 0.0208 |
vllm (pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6480 | ± | 0.0158 |
| - humanities | 2 | none | | acc | ↑ | 0.6615 | ± | 0.0328 |
| - other | 2 | none | | acc | ↑ | 0.6667 | ± | 0.0328 |
| - social sciences | 2 | none | | acc | ↑ | 0.7167 | ± | 0.0334 |
| - stem | 2 | none | | acc | ↑ | 0.5825 | ± | 0.0284 |
vllm (pretrained=/root/autodl-tmp/84-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.700 | ± | 0.0290 |
| | | strict-match | 5 | exact_match | ↑ | 0.648 | ± | 0.0303 |
vllm (pretrained=/root/autodl-tmp/84-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.692 | ± | 0.0207 |
| | | strict-match | 5 | exact_match | ↑ | 0.648 | ± | 0.0214 |
vllm (pretrained=/root/autodl-tmp/84-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6515 | ± | 0.0159 |
| - humanities | 2 | none | | acc | ↑ | 0.6718 | ± | 0.0325 |
| - other | 2 | none | | acc | ↑ | 0.6718 | ± | 0.0328 |
| - social sciences | 2 | none | | acc | ↑ | 0.7056 | ± | 0.0341 |
| - stem | 2 | none | | acc | ↑ | 0.5895 | ± | 0.0286 |
vllm (pretrained=/root/autodl-tmp/848-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.692 | ± | 0.0293 |
| | | strict-match | 5 | exact_match | ↑ | 0.660 | ± | 0.0300 |
vllm (pretrained=/root/autodl-tmp/848-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.724 | ± | 0.020 |
| | | strict-match | 5 | exact_match | ↑ | 0.674 | ± | 0.021 |
vllm (pretrained=/root/autodl-tmp/848-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6398 | ± | 0.0159 |
| - humanities | 2 | none | | acc | ↑ | 0.6513 | ± | 0.0333 |
| - other | 2 | none | | acc | ↑ | 0.6564 | ± | 0.0330 |
| - social sciences | 2 | none | | acc | ↑ | 0.7222 | ± | 0.0333 |
| - stem | 2 | none | | acc | ↑ | 0.5684 | ± | 0.0284 |
vllm (pretrained=/root/autodl-tmp/8485-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.732 | ± | 0.0281 |
| | | strict-match | 5 | exact_match | ↑ | 0.696 | ± | 0.0292 |
vllm (pretrained=/root/autodl-tmp/8485-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.720 | ± | 0.0201 |
| | | strict-match | 5 | exact_match | ↑ | 0.692 | ± | 0.0207 |
vllm (pretrained=/root/autodl-tmp/8485-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6550 | ± | 0.0158 |
| - humanities | 2 | none | | acc | ↑ | 0.6872 | ± | 0.0323 |
| - other | 2 | none | | acc | ↑ | 0.6769 | ± | 0.0327 |
| - social sciences | 2 | none | | acc | ↑ | 0.7056 | ± | 0.0341 |
| - stem | 2 | none | | acc | ↑ | 0.5860 | ± | 0.0284 |
vllm (pretrained=/root/autodl-tmp/8485-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.74 | ± | 0.0278 |
| | | strict-match | 5 | exact_match | ↑ | 0.68 | ± | 0.0296 |
vllm (pretrained=/root/autodl-tmp/8485-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.714 | ± | 0.0202 |
| | | strict-match | 5 | exact_match | ↑ | 0.676 | ± | 0.0210 |
vllm (pretrained=/root/autodl-tmp/8485-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6433 | ± | 0.0160 |
| - humanities | 2 | none | | acc | ↑ | 0.6513 | ± | 0.0337 |
| - other | 2 | none | | acc | ↑ | 0.6615 | ± | 0.0332 |
| - social sciences | 2 | none | | acc | ↑ | 0.7111 | ± | 0.0338 |
| - stem | 2 | none | | acc | ↑ | 0.5825 | ± | 0.0284 |
vllm (pretrained=/root/autodl-tmp/85-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.696 | ± | 0.0292 |
| | | strict-match | 5 | exact_match | ↑ | 0.648 | ± | 0.0303 |
vllm (pretrained=/root/autodl-tmp/85-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.708 | ± | 0.0204 |
| | | strict-match | 5 | exact_match | ↑ | 0.660 | ± | 0.0212 |
vllm (pretrained=/root/autodl-tmp/85-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6526 | ± | 0.0158 |
| - humanities | 2 | none | | acc | ↑ | 0.6615 | ± | 0.0331 |
| - other | 2 | none | | acc | ↑ | 0.6769 | ± | 0.0325 |
| - social sciences | 2 | none | | acc | ↑ | 0.7389 | ± | 0.0327 |
| - stem | 2 | none | | acc | ↑ | 0.5754 | ± | 0.0287 |
vllm (pretrained=/root/autodl-tmp/85-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.708 | ± | 0.0288 |
| | | strict-match | 5 | exact_match | ↑ | 0.648 | ± | 0.0303 |
vllm (pretrained=/root/autodl-tmp/85-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.720 | ± | 0.0201 |
| | | strict-match | 5 | exact_match | ↑ | 0.658 | ± | 0.0212 |
vllm (pretrained=/root/autodl-tmp/85-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6550 | ± | 0.0158 |
| - humanities | 2 | none | | acc | ↑ | 0.6769 | ± | 0.0324 |
| - other | 2 | none | | acc | ↑ | 0.6667 | ± | 0.0331 |
| - social sciences | 2 | none | | acc | ↑ | 0.7278 | ± | 0.0329 |
| - stem | 2 | none | | acc | ↑ | 0.5860 | ± | 0.0284 |
vllm (pretrained=/root/autodl-tmp/86-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.696 | ± | 0.0292 |
| | | strict-match | 5 | exact_match | ↑ | 0.636 | ± | 0.0305 |
vllm (pretrained=/root/autodl-tmp/86-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.690 | ± | 0.0207 |
| | | strict-match | 5 | exact_match | ↑ | 0.648 | ± | 0.0214 |
vllm (pretrained=/root/autodl-tmp/86-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6398 | ± | 0.0160 |
| - humanities | 2 | none | | acc | ↑ | 0.6410 | ± | 0.0336 |
| - other | 2 | none | | acc | ↑ | 0.6564 | ± | 0.0332 |
| - social sciences | 2 | none | | acc | ↑ | 0.7278 | ± | 0.0333 |
| - stem | 2 | none | | acc | ↑ | 0.5719 | ± | 0.0284 |
vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.688 | ± | 0.0294 |
| | | strict-match | 5 | exact_match | ↑ | 0.640 | ± | 0.0304 |
vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.706 | ± | 0.0204 |
| | | strict-match | 5 | exact_match | ↑ | 0.660 | ± | 0.0212 |
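MMLU results, presumably for /root/autodl-tmp/86-512 (the run header line is missing here; settings are presumably the same as the other MMLU runs — TODO confirm):

| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.6526 | ± | 0.0158 |
| - humanities | 2 | none | | acc | ↑ | 0.6821 | ± | 0.0327 |
| - other | 2 | none | | acc | ↑ | 0.6615 | ± | 0.0331 |
| - social sciences | 2 | none | | acc | ↑ | 0.7278 | ± | 0.0329 |
| - stem | 2 | none | | acc | ↑ | 0.5789 | ± | 0.0284 |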