Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
- scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json +71 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.5.4/harness.sh +15 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/harness.sh +16 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result.json +48 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result_2.json +35 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/xwinograd_ja.result.json +22 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/harness.sh +12 -0
- scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/result.json +71 -0
- scripts/yans/eval/lm-evaluation-harness/models/harness.conf +29 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/harness.sh +3 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/result.json +71 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/harness.sh +4 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/result.json +71 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh +3 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.sh +3 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.json +71 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.jsquad-1.2.json +22 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.jsquad-1.2.sh +3 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.sh +3 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.json +71 -0
- scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.jsquad-1.2.json +22 -0
- scripts/yans/eval/lm-evaluation-harness/models/openai/gpt3/result.mgsm.json +0 -0
- scripts/yans/eval/lm-evaluation-harness/models/stablelm/harness.conf +8 -0
- scripts/yans/eval/lm-evaluation-harness/models/stablelm/stablelm-jp-3b-ja50_rp50-700b/harness.conf +2 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json +1 -0
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json
ADDED
@@ -0,0 +1,71 @@
{
  "results": {
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.709562109025916,
      "acc_stderr": 0.013576910133903362,
      "acc_norm": 0.646112600536193,
      "acc_norm_stderr": 0.014300978485999566
    },
    "jnli-1.1-0.3": {
      "acc": 0.32251437962202134,
      "acc_stderr": 0.009476621341954925,
      "acc_norm": 0.2933442892358258,
      "acc_norm_stderr": 0.009230425066070979
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.8955354028601326,
      "acc_stderr": 0.00403956848782294,
      "acc_norm": 0.8955354028601326,
      "acc_norm_stderr": 0.00403956848782294
    },
    "xwinograd_ja": {
      "acc": 0.6392075078206465,
      "acc_stderr": 0.015515541059528105
    },
    "jsquad-1.1-0.3": {
      "exact_match": 32.37280504277353,
      "f1": 41.27834864657371
    },
    "jaqket_v2-0.1-0.3": {
      "exact_match": 32.302405498281786,
      "f1": 37.16938035237004
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 0.3661308935912556
    },
    "mgsm-1.0-0.3": {
      "acc": 0.012,
      "acc_stderr": 0.006900323023694269
    }
  },
  "versions": {
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1,
    "jsquad-1.1-0.3": 1.1,
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "xwinograd_ja": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=${PROJECT_DIR}/instruction_tuning/outputs/open-calm-instruct-3b_1.3.0,tokenizer=cyberagent/open-calm-3b",
    "num_fewshot": [
      3,
      3,
      3,
      2,
      1,
      1,
      0,
      5
    ],
    "batch_size": null,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
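Note (not part of the commit): the result.json files in this commit all share the structure above — a "results" block keyed by task, a "versions" block, and the harness "config". A minimal Python sketch, assuming only that structure, for flattening one report into a per-task metric table:

    import json

    def summarize(path: str) -> None:
        # Print task / metric / value rows, skipping the *_stderr companions.
        with open(path) as f:
            report = json.load(f)
        for task, metrics in report["results"].items():
            for name, value in metrics.items():
                if name.endswith("_stderr"):
                    continue
                print(f"{task:25s} {name:12s} {value:.4f}")

    summarize("models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json")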
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.5.4/harness.sh
ADDED
@@ -0,0 +1,15 @@
#!/bin/bash

PRETRAINED="${PROJECT_DIR}/sft/checkpoints/open-calm-instruct-7b_1.5.4/"
TOKENIZER="cyberagent/open-calm-7b"
MODEL_ARGS="pretrained=${PRETRAINED},tokenizer=${TOKENIZER}"
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
NUM_FEWSHOT="2,3,3,3"
OUTPUT_PATH="models/community/cyberagent-open-calm-instruct-7b_1.5.4/result.json"
python main.py \
    --model hf-causal \
    --model_args $MODEL_ARGS \
    --tasks $TASK \
    --num_fewshot $NUM_FEWSHOT \
    --device "cuda" \
    --output_path $OUTPUT_PATH
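Note: in these scripts --num_fewshot takes a comma-separated list paired position-wise with --tasks, so here jsquad-1.1-0.3 runs with 2 shots and the three classification tasks with 3; the corresponding result.json records the same pairing under "config.num_fewshot".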
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/harness.sh
ADDED
@@ -0,0 +1,16 @@
#!/bin/bash

PROJECT_DIR=""
PRETRAINED="${PROJECT_DIR}/sft/models/open-calm-instruct-7b_1.9.4/"
TOKENIZER="cyberagent/open-calm-7b"
MODEL_ARGS="pretrained=${PRETRAINED},tokenizer=${TOKENIZER}"
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
NUM_FEWSHOT="2,3,3,3"
OUTPUT_PATH="models/community/cyberagent-open-calm-instruct-7b_1.9.4/result.json"
python main.py \
    --model hf-causal \
    --model_args $MODEL_ARGS \
    --tasks $TASK \
    --num_fewshot $NUM_FEWSHOT \
    --device "cuda" \
    --output_path $OUTPUT_PATH
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result.json
ADDED
@@ -0,0 +1,48 @@
{
  "results": {
    "jsquad-1.1-0.3": {
      "exact_match": 35.862224223322826,
      "f1": 46.264251886156444
    },
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.7220732797140303,
      "acc_stderr": 0.0133978430711737,
      "acc_norm": 0.6586237712243074,
      "acc_norm_stderr": 0.014181247513525484
    },
    "jnli-1.1-0.3": {
      "acc": 0.3985209531635168,
      "acc_stderr": 0.00992578301888985,
      "acc_norm": 0.3742810188989318,
      "acc_norm_stderr": 0.00981109569636444
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.8636363636363636,
      "acc_stderr": 0.004564311271779641,
      "acc_norm": 0.8636363636363636,
      "acc_norm_stderr": 0.004564311271779641
    }
  },
  "versions": {
    "jsquad-1.1-0.3": 1.1,
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=sft/models/open-calm-instruct-7b_1.9.4/,tokenizer=cyberagent/open-calm-7b",
    "num_fewshot": [
      2,
      3,
      3,
      3
    ],
    "batch_size": null,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result_2.json
ADDED
@@ -0,0 +1,35 @@
{
  "results": {
    "jaqket_v2-0.1-0.3": {
      "exact_match": 40.292096219931274,
      "f1": 45.46446299023617
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 1.0537473567422466
    },
    "mgsm-1.0-0.3": {
      "acc": 0.012,
      "acc_stderr": 0.006900323023694277
    }
  },
  "versions": {
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=sft/checkpoints/open-calm-instruct-7b_1.9.4/,tokenizer=cyberagent/open-calm-7b",
    "num_fewshot": [
      1,
      1,
      4
    ],
    "batch_size": null,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/xwinograd_ja.result.json
ADDED
@@ -0,0 +1,22 @@
{
  "results": {
    "xwinograd_ja": {
      "acc": 0.6517205422314911,
      "acc_stderr": 0.015392596336826887
    }
  },
  "versions": {
    "xwinograd_ja": 1.0
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=sft/checkpoints/open-calm-instruct-7b_1.9.4/,tokenizer=cyberagent/open-calm-7b",
    "num_fewshot": 0,
    "batch_size": null,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/harness.sh
ADDED
@@ -0,0 +1,12 @@
#!/bin/bash
set -eu
PROJECT_DIR=""
MODEL_ARGS="pretrained=${PROJECT_DIR}/instruction_tuning/outputs/rinna-instruct-1b_0.1.0/,tokenizer=rinna/japanese-gpt-1b,use_fast=False"
TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
python main.py \
    --model hf-causal \
    --model_args $MODEL_ARGS \
    --tasks $TASK \
    --num_fewshot "3,3,3,2,1,1,0,5" \
    --device "cuda" \
    --output_path "models/rinna/rinna-instruct-1b_0.1.0/result.json"
scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/result.json
ADDED
@@ -0,0 +1,71 @@
{
  "results": {
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.6175156389633601,
      "acc_stderr": 0.014534828771100699,
      "acc_norm": 0.5120643431635389,
      "acc_norm_stderr": 0.014949361502212136
    },
    "jnli-1.1-0.3": {
      "acc": 0.2921117502054232,
      "acc_stderr": 0.009219042365016429,
      "acc_norm": 0.25965488907148726,
      "acc_norm_stderr": 0.00888882393679571
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.788280432507848,
      "acc_stderr": 0.005395477692275257,
      "acc_norm": 0.788280432507848,
      "acc_norm_stderr": 0.005395477692275257
    },
    "xwinograd_ja": {
      "acc": 0.6496350364963503,
      "acc_stderr": 0.015413891595766074
    },
    "jsquad-1.1-0.3": {
      "exact_match": 17.086897793786584,
      "f1": 26.487150700412975
    },
    "jaqket_v2-0.1-0.3": {
      "exact_match": 20.876288659793815,
      "f1": 27.990153222111978
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 1.3375721893967882
    },
    "mgsm-1.0-0.3": {
      "acc": 0.012,
      "acc_stderr": 0.006900323023694273
    }
  },
  "versions": {
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1,
    "jsquad-1.1-0.3": 1.1,
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "xwinograd_ja": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=${PROJECT_DIR}/instruction_tuning/outputs/rinna-instruct-1b_0.1.0/,tokenizer=rinna/japanese-gpt-1b,use_fast=False",
    "num_fewshot": [
      3,
      3,
      3,
      2,
      1,
      1,
      0,
      5
    ],
    "batch_size": null,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/harness.conf
ADDED
@@ -0,0 +1,29 @@
[model]
# the --model option to the training script
model = hf-causal

[tasks.jcommonsenseqa-1.1]
fewshot = 3

[tasks.jnli]
fewshot = 3

[tasks.marc_ja]
fewshot = 3

[tasks.jsquad-1.1]
fewshot = 2

[tasks.jaqket_v2-0.1]
fewshot = 1

[tasks.xlsum_ja]
fewshot = 1

[tasks.xwinograd_ja]
fewshot = 0
# This specifically has no prompt
prompt = ""

[tasks.mgsm]
fewshot = 5
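Note (not part of the commit): harness.conf files like the one above use INI sections — [model] plus one [tasks.<name>] section per task. A minimal sketch, assuming these files are read with Python's configparser, for recovering the per-task few-shot counts (ExtendedInterpolation would also resolve the ${project_dir} reference used in the stablelm conf further down):

    from configparser import ConfigParser, ExtendedInterpolation

    conf = ConfigParser(interpolation=ExtendedInterpolation())
    conf.read("models/harness.conf")

    model = conf["model"]["model"]  # "hf-causal"
    fewshot = {
        section.split(".", 1)[1]: conf[section].getint("fewshot")
        for section in conf.sections()
        if section.startswith("tasks.")
    }
    print(model, fewshot)  # e.g. {'jcommonsenseqa-1.1': 3, 'jnli': 3, ...}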
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/harness.sh
ADDED
@@ -0,0 +1,3 @@
MODEL_ARGS="pretrained=meta-llama/Llama-2-13b-chat-hf,use_accelerate=True"
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-13b-chat/result.json" --batch_size 2 > models/llama2/llama2-13b-chat/harness.out 2> models/llama2/llama2-13b-chat/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/result.json
ADDED
@@ -0,0 +1,71 @@
{
  "results": {
    "jsquad-1.1-0.3": {
      "exact_match": 67.69473210265646,
      "f1": 82.68867939081463
    },
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.7256478999106345,
      "acc_stderr": 0.01334431281465833,
      "acc_norm": 0.3967828418230563,
      "acc_norm_stderr": 0.01463161897855815
    },
    "jnli-1.1-0.3": {
      "acc": 0.3562037797863599,
      "acc_stderr": 0.009708506341194316,
      "acc_norm": 0.3648315529991783,
      "acc_norm_stderr": 0.00975932091977734
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.5992217898832685,
      "acc_stderr": 0.006517879943818406,
      "acc_norm": 0.5992217898832685,
      "acc_norm_stderr": 0.006517879943818406
    },
    "jaqket_v2-0.1-0.3": {
      "exact_match": 48.1958762886598,
      "f1": 63.75233331776556
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 15.14282905950018
    },
    "mgsm-1.0-0.3": {
      "acc": 0.132,
      "acc_stderr": 0.021450980824038107
    },
    "xwinograd_ja": {
      "acc": 0.6381647549530761,
      "acc_stderr": 0.015525267319875928
    }
  },
  "versions": {
    "jsquad-1.1-0.3": 1.1,
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1,
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "xwinograd_ja": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=meta-llama/Llama-2-13b-chat-hf,use_accelerate=True",
    "num_fewshot": [
      2,
      3,
      3,
      3,
      1,
      1,
      0,
      5
    ],
    "batch_size": 2,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/harness.sh
ADDED
@@ -0,0 +1,4 @@
MODEL_ARGS="pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True"

TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-13b/result.json" --batch_size 2 > models/llama2/llama2-13b/harness.out 2> models/llama2/llama2-13b/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/result.json
ADDED
@@ -0,0 +1,71 @@
{
  "results": {
    "jsquad-1.1-0.3": {
      "exact_match": 76.13687528140477,
      "f1": 86.24170446058177
    },
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.7488829311885612,
      "acc_stderr": 0.012969528294765333,
      "acc_norm": 0.40035746201966044,
      "acc_norm_stderr": 0.014653766897279888
    },
    "jnli-1.1-0.3": {
      "acc": 0.2198027937551356,
      "acc_stderr": 0.008395522792803168,
      "acc_norm": 0.30156121610517667,
      "acc_norm_stderr": 0.009304239098715018
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.38892819243013793,
      "acc_stderr": 0.006483975178620039,
      "acc_norm": 0.38892819243013793,
      "acc_norm_stderr": 0.006483975178620039
    },
    "jaqket_v2-0.1-0.3": {
      "exact_match": 67.69759450171821,
      "f1": 74.62526066907506
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 18.110069857141642
    },
    "mgsm-1.0-0.3": {
      "acc": 0.1,
      "acc_stderr": 0.01901172751573437
    },
    "xwinograd_ja": {
      "acc": 0.6287799791449427,
      "acc_stderr": 0.015609259235278878
    }
  },
  "versions": {
    "jsquad-1.1-0.3": 1.1,
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1,
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "xwinograd_ja": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True",
    "num_fewshot": [
      2,
      3,
      3,
      3,
      1,
      1,
      0,
      5
    ],
    "batch_size": 2,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh
ADDED
@@ -0,0 +1,3 @@
MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,dtype=auto"
TASK="jsquad-1.2-0.3"
python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-7b-chat/result.jsquad-1.2.json" --batch_size 2
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.sh
ADDED
@@ -0,0 +1,3 @@
MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True"
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-7b-chat/result.json" --batch_size 2 > models/llama2/llama2-7b-chat/harness.out 2> models/llama2/llama2-7b-chat/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.json
ADDED
@@ -0,0 +1,71 @@
{
  "results": {
    "jsquad-1.1-0.3": {
      "exact_match": 59.34263845114813,
      "f1": 73.13860295063034
    },
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.5558534405719392,
      "acc_stderr": 0.014860122802670312,
      "acc_norm": 0.30831099195710454,
      "acc_norm_stderr": 0.013811124479483027
    },
    "jnli-1.1-0.3": {
      "acc": 0.2953985209531635,
      "acc_stderr": 0.00924921508921067,
      "acc_norm": 0.3175842235004108,
      "acc_norm_stderr": 0.009438064365860652
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.9041386628935267,
      "acc_stderr": 0.00391561306533889,
      "acc_norm": 0.9041386628935267,
      "acc_norm_stderr": 0.00391561306533889
    },
    "jaqket_v2-0.1-0.3": {
      "exact_match": 17.9553264604811,
      "f1": 31.006768969536488
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 2.339856054050597
    },
    "mgsm-1.0-0.3": {
      "acc": 0.092,
      "acc_stderr": 0.018316275379429644
    },
    "xwinograd_ja": {
      "acc": 0.6611053180396246,
      "acc_stderr": 0.015292727421996942
    }
  },
  "versions": {
    "jsquad-1.1-0.3": 1.1,
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1,
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "xwinograd_ja": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True",
    "num_fewshot": [
      2,
      3,
      3,
      3,
      1,
      1,
      0,
      5
    ],
    "batch_size": 2,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.jsquad-1.2.json
ADDED
@@ -0,0 +1,22 @@
{
  "results": {
    "jsquad-1.2-0.3": {
      "exact_match": 62.17919855920756,
      "f1": 74.84345935966519
    }
  },
  "versions": {
    "jsquad-1.2-0.3": 1.2
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,dtype=auto",
    "num_fewshot": 2,
    "batch_size": 2,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.jsquad-1.2.sh
ADDED
@@ -0,0 +1,3 @@
MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto"
TASK="jsquad-1.2-0.3"
python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-7b/result.jsquad-1.2.json" --batch_size 2
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.sh
ADDED
@@ -0,0 +1,3 @@
MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True"
TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-7b/result.json" --batch_size 2 > models/llama2/llama2-7b/harness.out 2> models/llama2/llama2-7b/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.json
ADDED
@@ -0,0 +1,71 @@
{
  "results": {
    "jsquad-1.1-0.3": {
      "exact_match": 58.39711841512832,
      "f1": 69.52916111780529
    },
    "jcommonsenseqa-1.1-0.3": {
      "acc": 0.5263628239499554,
      "acc_stderr": 0.014932915029029303,
      "acc_norm": 0.29222520107238603,
      "acc_norm_stderr": 0.013601458439195222
    },
    "jnli-1.1-0.3": {
      "acc": 0.28225143796220215,
      "acc_stderr": 0.009125006713744669,
      "acc_norm": 0.30156121610517667,
      "acc_norm_stderr": 0.009304239098715018
    },
    "marc_ja-1.1-0.3": {
      "acc": 0.8604527767951893,
      "acc_stderr": 0.004608765667738413,
      "acc_norm": 0.8604527767951893,
      "acc_norm_stderr": 0.004608765667738413
    },
    "jaqket_v2-0.1-0.3": {
      "exact_match": 38.83161512027491,
      "f1": 43.653527171568406
    },
    "xlsum_ja-1.0-0.3": {
      "rouge2": 9.32010216666052
    },
    "mgsm-1.0-0.3": {
      "acc": 0.056,
      "acc_stderr": 0.014570697336899597
    },
    "xwinograd_ja": {
      "acc": 0.6465067778936392,
      "acc_stderr": 0.015445228301221376
    }
  },
  "versions": {
    "jsquad-1.1-0.3": 1.1,
    "jcommonsenseqa-1.1-0.3": 1.1,
    "jnli-1.1-0.3": 1.1,
    "marc_ja-1.1-0.3": 1.1,
    "jaqket_v2-0.1-0.3": 0.1,
    "xlsum_ja-1.0-0.3": 1.0,
    "xwinograd_ja": 1.0,
    "mgsm-1.0-0.3": 1.0
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True",
    "num_fewshot": [
      2,
      3,
      3,
      3,
      1,
      1,
      0,
      5
    ],
    "batch_size": 2,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.jsquad-1.2.json
ADDED
@@ -0,0 +1,22 @@
{
  "results": {
    "jsquad-1.2-0.3": {
      "exact_match": 59.92796037820801,
      "f1": 70.8236875084182
    }
  },
  "versions": {
    "jsquad-1.2-0.3": 1.2
  },
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto",
    "num_fewshot": 2,
    "batch_size": 2,
    "device": "cuda",
    "no_cache": false,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
scripts/yans/eval/lm-evaluation-harness/models/openai/gpt3/result.mgsm.json
ADDED
The diff for this file is too large to render; see the raw diff.
scripts/yans/eval/lm-evaluation-harness/models/stablelm/harness.conf
ADDED
@@ -0,0 +1,8 @@
[DEFAULT]
prompt = 0.3

[model]
# XXX change this to your project dir
project_dir = .
tokenizer = ${project_dir}/tokenizers/nai-hf-tokenizer/
args = use_fast=False
scripts/yans/eval/lm-evaluation-harness/models/stablelm/stablelm-jp-3b-ja50_rp50-700b/harness.conf
ADDED
@@ -0,0 +1,2 @@
[model]
path = ${PROJECT_DIR}/hf_model/3b-ja50_rp50-700b
scripts/yans/eval/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"anli_r2": {"acc": 0.356, "acc_stderr": 0.015149042659306628}}, "versions": {"anli_r2": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"arc_challenge": {"acc": 0.24488054607508533, "acc_norm": 0.2440273037542662, "acc_norm_stderr": 0.012551447627856257, "acc_stderr": 0.012566273985131354}}, "versions": {"arc_challenge": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"blimp_expletive_it_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_expletive_it_object_raising": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"blimp_superlative_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_2": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until
ADDED
@@ -0,0 +1 @@
4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"crows_pairs_english_age": {"likelihood_difference": 0.3160680928470684, "likelihood_difference_stderr": 0.02397758321605678, "pct_stereotype": 0.43956043956043955, "pct_stereotype_stderr": 0.05231815698566189}}, "versions": {"crows_pairs_english_age": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
b85bc849811ccfa9971a6ee3fca7342752c314c0cb6f126e10d9ec4d0450c541
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
6f9119026abff33c5c882d6172e092e806a8b21bd86864022978b1961839350f
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"crows_pairs_french_sexual_orientation": {"likelihood_difference": 0.3160680928470684, "likelihood_difference_stderr": 0.02397758321605678, "pct_stereotype": 0.43956043956043955, "pct_stereotype_stderr": 0.05231815698566189}}, "versions": {"crows_pairs_french_sexual_orientation": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"hendrycksTest-high_school_geography": {"acc": 0.2474747474747475, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03191178226713547, "acc_stderr": 0.03074630074212452}}, "versions": {"hendrycksTest-high_school_geography": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
11f40d8f48ba5cd739e21d54c3c04d3761f81df5cb7ddd77df868d24ced44b49
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"hendrycksTest-high_school_us_history": {"acc": 0.29901960784313725, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.03166009679399814, "acc_stderr": 0.03213325717373618}}, "versions": {"hendrycksTest-high_school_us_history": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json
ADDED
@@ -0,0 +1 @@
{"results": {"hendrycksTest-human_aging": {"acc": 0.21524663677130046, "acc_norm": 0.17937219730941703, "acc_norm_stderr": 0.025749819569192804, "acc_stderr": 0.02758406660220827}}, "versions": {"hendrycksTest-human_aging": 0}}