koichi12 committed
Commit 08d7121 · verified · 1 Parent(s): af928a2

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json +71 -0
  2. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.5.4/harness.sh +15 -0
  3. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/harness.sh +16 -0
  4. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result.json +48 -0
  5. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result_2.json +35 -0
  6. scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/xwinograd_ja.result.json +22 -0
  7. scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/harness.sh +12 -0
  8. scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/result.json +71 -0
  9. scripts/yans/eval/lm-evaluation-harness/models/harness.conf +29 -0
  10. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/harness.sh +3 -0
  11. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/result.json +71 -0
  12. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/harness.sh +4 -0
  13. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/result.json +71 -0
  14. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh +3 -0
  15. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.sh +3 -0
  16. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.json +71 -0
  17. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.jsquad-1.2.json +22 -0
  18. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.jsquad-1.2.sh +3 -0
  19. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.sh +3 -0
  20. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.json +71 -0
  21. scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.jsquad-1.2.json +22 -0
  22. scripts/yans/eval/lm-evaluation-harness/models/openai/gpt3/result.mgsm.json +0 -0
  23. scripts/yans/eval/lm-evaluation-harness/models/stablelm/harness.conf +8 -0
  24. scripts/yans/eval/lm-evaluation-harness/models/stablelm/stablelm-jp-3b-ja50_rp50-700b/harness.conf +2 -0
  25. scripts/yans/eval/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json +1 -0
  26. scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json +1 -0
  27. scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood +1 -0
  28. scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood +1 -0
  29. scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood +1 -0
  30. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood +1 -0
  31. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood +1 -0
  32. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood +1 -0
  33. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood +1 -0
  34. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json +1 -0
  35. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood +1 -0
  36. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json +1 -0
  37. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json +1 -0
  38. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood +1 -0
  39. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood +1 -0
  40. scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json +1 -0
  41. scripts/yans/eval/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until +1 -0
  42. scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json +1 -0
  43. scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood +1 -0
  44. scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood +1 -0
  45. scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json +1 -0
  46. scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood +1 -0
  47. scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json +1 -0
  48. scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood +1 -0
  49. scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json +1 -0
  50. scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json +1 -0
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "results": {
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.709562109025916,
+ "acc_stderr": 0.013576910133903362,
+ "acc_norm": 0.646112600536193,
+ "acc_norm_stderr": 0.014300978485999566
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.32251437962202134,
+ "acc_stderr": 0.009476621341954925,
+ "acc_norm": 0.2933442892358258,
+ "acc_norm_stderr": 0.009230425066070979
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.8955354028601326,
+ "acc_stderr": 0.00403956848782294,
+ "acc_norm": 0.8955354028601326,
+ "acc_norm_stderr": 0.00403956848782294
+ },
+ "xwinograd_ja": {
+ "acc": 0.6392075078206465,
+ "acc_stderr": 0.015515541059528105
+ },
+ "jsquad-1.1-0.3": {
+ "exact_match": 32.37280504277353,
+ "f1": 41.27834864657371
+ },
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 32.302405498281786,
+ "f1": 37.16938035237004
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 0.3661308935912556
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.012,
+ "acc_stderr": 0.006900323023694269
+ }
+ },
+ "versions": {
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1,
+ "jsquad-1.1-0.3": 1.1,
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "xwinograd_ja": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=${PROJECT_DIR}/instruction_tuning/outputs/open-calm-instruct-3b_1.3.0,tokenizer=cyberagent/open-calm-3b",
+ "num_fewshot": [
+ 3,
+ 3,
+ 3,
+ 2,
+ 1,
+ 1,
+ 0,
+ 5
+ ],
+ "batch_size": null,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
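Note: the result.json files in this commit all share the layout visible above (a "results" block keyed by task name, a parallel "versions" block, and the run "config"). A minimal sketch, not part of the commit itself, of tabulating one of them in Python, assuming only that layout:

import json

# Summarize one result.json from this commit. Assumes only the layout shown
# above: data["results"] maps task name -> metric values and data["versions"]
# maps task name -> task version.
path = "models/community/cyberagent-open-calm-instruct-3b_1.3.0/result.json"
with open(path) as f:
    data = json.load(f)
for task, metrics in data["results"].items():
    version = data["versions"].get(task, "?")
    rendered = ", ".join(f"{name}={value:.4f}" for name, value in metrics.items())
    print(f"{task} (v{version}): {rendered}")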
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.5.4/harness.sh ADDED
@@ -0,0 +1,15 @@
+ #!/bin/bash
+
+ PRETRAINED="${PROJECT_DIR}/sft/checkpoints/open-calm-instruct-7b_1.5.4/"
+ TOKENIZER="cyberagent/open-calm-7b"
+ MODEL_ARGS="pretrained=${PRETRAINED},tokenizer=${TOKENIZER}"
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
+ NUM_FEWSHOT="2,3,3,3"
+ OUTPUT_PATH="models/community/cyberagent-open-calm-instruct-7b_1.5.4/result.json"
+ python main.py \
+ --model hf-causal \
+ --model_args $MODEL_ARGS \
+ --tasks $TASK \
+ --num_fewshot $NUM_FEWSHOT \
+ --device "cuda" \
+ --output_path $OUTPUT_PATH
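Note: in the harness.sh scripts above, --tasks and --num_fewshot are passed as parallel comma-separated lists; the "num_fewshot" arrays in the result.json files record the same values in the same order. A small illustrative pairing (not how main.py itself parses the flags):

# Pair each task with its few-shot count, mirroring the shell variables above.
TASK = "jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
NUM_FEWSHOT = "2,3,3,3"
for task, shots in zip(TASK.split(","), NUM_FEWSHOT.split(",")):
    print(f"{task}: {int(shots)}-shot")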
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/harness.sh ADDED
@@ -0,0 +1,16 @@
+ #!/bin/bash
+
+ PROJECT_DIR=""
+ PRETRAINED="${PROJECT_DIR}/sft/models/open-calm-instruct-7b_1.9.4/"
+ TOKENIZER="cyberagent/open-calm-7b"
+ MODEL_ARGS="pretrained=${PRETRAINED},tokenizer=${TOKENIZER}"
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3"
+ NUM_FEWSHOT="2,3,3,3"
+ OUTPUT_PATH="models/community/cyberagent-open-calm-instruct-7b_1.9.4/result.json"
+ python main.py \
+ --model hf-causal \
+ --model_args $MODEL_ARGS \
+ --tasks $TASK \
+ --num_fewshot $NUM_FEWSHOT \
+ --device "cuda" \
+ --output_path $OUTPUT_PATH
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result.json ADDED
@@ -0,0 +1,48 @@
+ {
+ "results": {
+ "jsquad-1.1-0.3": {
+ "exact_match": 35.862224223322826,
+ "f1": 46.264251886156444
+ },
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.7220732797140303,
+ "acc_stderr": 0.0133978430711737,
+ "acc_norm": 0.6586237712243074,
+ "acc_norm_stderr": 0.014181247513525484
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.3985209531635168,
+ "acc_stderr": 0.00992578301888985,
+ "acc_norm": 0.3742810188989318,
+ "acc_norm_stderr": 0.00981109569636444
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.8636363636363636,
+ "acc_stderr": 0.004564311271779641,
+ "acc_norm": 0.8636363636363636,
+ "acc_norm_stderr": 0.004564311271779641
+ }
+ },
+ "versions": {
+ "jsquad-1.1-0.3": 1.1,
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1
+ },
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=sft/models/open-calm-instruct-7b_1.9.4/,tokenizer=cyberagent/open-calm-7b",
+ "num_fewshot": [
+ 2,
+ 3,
+ 3,
+ 3
+ ],
+ "batch_size": null,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/result_2.json ADDED
@@ -0,0 +1,35 @@
+ {
+ "results": {
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 40.292096219931274,
+ "f1": 45.46446299023617
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 1.0537473567422466
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.012,
+ "acc_stderr": 0.006900323023694277
+ }
+ },
+ "versions": {
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=sft/checkpoints/open-calm-instruct-7b_1.9.4/,tokenizer=cyberagent/open-calm-7b",
+ "num_fewshot": [
+ 1,
+ 1,
+ 4
+ ],
+ "batch_size": null,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/community/cyberagent-open-calm-instruct-7b_1.9.4/xwinograd_ja.result.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "xwinograd_ja": {
+ "acc": 0.6517205422314911,
+ "acc_stderr": 0.015392596336826887
+ }
+ },
+ "versions": {
+ "xwinograd_ja": 1.0
+ },
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=sft/checkpoints/open-calm-instruct-7b_1.9.4/,tokenizer=cyberagent/open-calm-7b",
+ "num_fewshot": 0,
+ "batch_size": null,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/harness.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ set -eu
+ PROJECT_DIR=""
+ MODEL_ARGS="pretrained=${PROJECT_DIR}/instruction_tuning/outputs/rinna-instruct-1b_0.1.0/,tokenizer=rinna/japanese-gpt-1b,use_fast=False"
+ TASK="jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jsquad-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+ python main.py \
+ --model hf-causal \
+ --model_args $MODEL_ARGS \
+ --tasks $TASK \
+ --num_fewshot "3,3,3,2,1,1,0,5" \
+ --device "cuda" \
+ --output_path "models/rinna/rinna-instruct-1b_0.1.0/result.json"
scripts/yans/eval/lm-evaluation-harness/models/community/rinna-instruct-1b_0.1.0/result.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "results": {
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.6175156389633601,
+ "acc_stderr": 0.014534828771100699,
+ "acc_norm": 0.5120643431635389,
+ "acc_norm_stderr": 0.014949361502212136
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.2921117502054232,
+ "acc_stderr": 0.009219042365016429,
+ "acc_norm": 0.25965488907148726,
+ "acc_norm_stderr": 0.00888882393679571
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.788280432507848,
+ "acc_stderr": 0.005395477692275257,
+ "acc_norm": 0.788280432507848,
+ "acc_norm_stderr": 0.005395477692275257
+ },
+ "xwinograd_ja": {
+ "acc": 0.6496350364963503,
+ "acc_stderr": 0.015413891595766074
+ },
+ "jsquad-1.1-0.3": {
+ "exact_match": 17.086897793786584,
+ "f1": 26.487150700412975
+ },
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 20.876288659793815,
+ "f1": 27.990153222111978
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 1.3375721893967882
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.012,
+ "acc_stderr": 0.006900323023694273
+ }
+ },
+ "versions": {
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1,
+ "jsquad-1.1-0.3": 1.1,
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "xwinograd_ja": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=${PROJECT_DIR}/instruction_tuning/outputs/rinna-instruct-1b_0.1.0/,tokenizer=rinna/japanese-gpt-1b,use_fast=False",
+ "num_fewshot": [
+ 3,
+ 3,
+ 3,
+ 2,
+ 1,
+ 1,
+ 0,
+ 5
+ ],
+ "batch_size": null,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/harness.conf ADDED
@@ -0,0 +1,29 @@
+ [model]
+ # the --model option to the training script
+ model = hf-causal
+
+ [tasks.jcommonsenseqa-1.1]
+ fewshot = 3
+
+ [tasks.jnli]
+ fewshot = 3
+
+ [tasks.marc_ja]
+ fewshot = 3
+
+ [tasks.jsquad-1.1]
+ fewshot = 2
+
+ [tasks.jaqket_v2-0.1]
+ fewshot = 1
+
+ [tasks.xlsum_ja]
+ fewshot = 1
+
+ [tasks.xwinograd_ja]
+ fewshot = 0
+ # This specifically has no prompt
+ prompt = ""
+
+ [tasks.mgsm]
+ fewshot = 5
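Note: models/harness.conf collects the same per-task few-shot counts that the shell scripts pass on the command line. A rough sketch of reading it with Python's configparser (the script that actually consumes this file is not part of this diff, so this is only an illustration):

import configparser

# List the per-task few-shot settings declared in models/harness.conf.
conf = configparser.ConfigParser()
conf.read("models/harness.conf")
tasks, fewshots = [], []
for section in conf.sections():
    if section.startswith("tasks."):
        tasks.append(section.split(".", 1)[1])
        fewshots.append(conf[section]["fewshot"])
print("--tasks", ",".join(tasks))
print("--num_fewshot", ",".join(fewshots))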
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/harness.sh ADDED
@@ -0,0 +1,3 @@
+ MODEL_ARGS="pretrained=meta-llama/Llama-2-13b-chat-hf,use_accelerate=True"
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-13b-chat/result.json" --batch_size 2 > models/llama2/llama2-13b-chat/harness.out 2> models/llama2/llama2-13b-chat/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b-chat/result.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "results": {
+ "jsquad-1.1-0.3": {
+ "exact_match": 67.69473210265646,
+ "f1": 82.68867939081463
+ },
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.7256478999106345,
+ "acc_stderr": 0.01334431281465833,
+ "acc_norm": 0.3967828418230563,
+ "acc_norm_stderr": 0.01463161897855815
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.3562037797863599,
+ "acc_stderr": 0.009708506341194316,
+ "acc_norm": 0.3648315529991783,
+ "acc_norm_stderr": 0.00975932091977734
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.5992217898832685,
+ "acc_stderr": 0.006517879943818406,
+ "acc_norm": 0.5992217898832685,
+ "acc_norm_stderr": 0.006517879943818406
+ },
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 48.1958762886598,
+ "f1": 63.75233331776556
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 15.14282905950018
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.132,
+ "acc_stderr": 0.021450980824038107
+ },
+ "xwinograd_ja": {
+ "acc": 0.6381647549530761,
+ "acc_stderr": 0.015525267319875928
+ }
+ },
+ "versions": {
+ "jsquad-1.1-0.3": 1.1,
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1,
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "xwinograd_ja": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal-experimental",
+ "model_args": "pretrained=meta-llama/Llama-2-13b-chat-hf,use_accelerate=True",
+ "num_fewshot": [
+ 2,
+ 3,
+ 3,
+ 3,
+ 1,
+ 1,
+ 0,
+ 5
+ ],
+ "batch_size": 2,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/harness.sh ADDED
@@ -0,0 +1,4 @@
+ MODEL_ARGS="pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True"
+
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-13b/result.json" --batch_size 2 > models/llama2/llama2-13b/harness.out 2> models/llama2/llama2-13b/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-13b/result.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "results": {
+ "jsquad-1.1-0.3": {
+ "exact_match": 76.13687528140477,
+ "f1": 86.24170446058177
+ },
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.7488829311885612,
+ "acc_stderr": 0.012969528294765333,
+ "acc_norm": 0.40035746201966044,
+ "acc_norm_stderr": 0.014653766897279888
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.2198027937551356,
+ "acc_stderr": 0.008395522792803168,
+ "acc_norm": 0.30156121610517667,
+ "acc_norm_stderr": 0.009304239098715018
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.38892819243013793,
+ "acc_stderr": 0.006483975178620039,
+ "acc_norm": 0.38892819243013793,
+ "acc_norm_stderr": 0.006483975178620039
+ },
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 67.69759450171821,
+ "f1": 74.62526066907506
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 18.110069857141642
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.1,
+ "acc_stderr": 0.01901172751573437
+ },
+ "xwinograd_ja": {
+ "acc": 0.6287799791449427,
+ "acc_stderr": 0.015609259235278878
+ }
+ },
+ "versions": {
+ "jsquad-1.1-0.3": 1.1,
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1,
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "xwinograd_ja": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal-experimental",
+ "model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True",
+ "num_fewshot": [
+ 2,
+ 3,
+ 3,
+ 3,
+ 1,
+ 1,
+ 0,
+ 5
+ ],
+ "batch_size": 2,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
+ MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,dtype=auto"
+ TASK="jsquad-1.2-0.3"
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-7b-chat/result.jsquad-1.2.json" --batch_size 2
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/harness.sh ADDED
@@ -0,0 +1,3 @@
+ MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True"
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-7b-chat/result.json" --batch_size 2 > models/llama2/llama2-7b-chat/harness.out 2> models/llama2/llama2-7b-chat/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "results": {
+ "jsquad-1.1-0.3": {
+ "exact_match": 59.34263845114813,
+ "f1": 73.13860295063034
+ },
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.5558534405719392,
+ "acc_stderr": 0.014860122802670312,
+ "acc_norm": 0.30831099195710454,
+ "acc_norm_stderr": 0.013811124479483027
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.2953985209531635,
+ "acc_stderr": 0.00924921508921067,
+ "acc_norm": 0.3175842235004108,
+ "acc_norm_stderr": 0.009438064365860652
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.9041386628935267,
+ "acc_stderr": 0.00391561306533889,
+ "acc_norm": 0.9041386628935267,
+ "acc_norm_stderr": 0.00391561306533889
+ },
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 17.9553264604811,
+ "f1": 31.006768969536488
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 2.339856054050597
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.092,
+ "acc_stderr": 0.018316275379429644
+ },
+ "xwinograd_ja": {
+ "acc": 0.6611053180396246,
+ "acc_stderr": 0.015292727421996942
+ }
+ },
+ "versions": {
+ "jsquad-1.1-0.3": 1.1,
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1,
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "xwinograd_ja": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal-experimental",
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True",
+ "num_fewshot": [
+ 2,
+ 3,
+ 3,
+ 3,
+ 1,
+ 1,
+ 0,
+ 5
+ ],
+ "batch_size": 2,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b-chat/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "jsquad-1.2-0.3": {
+ "exact_match": 62.17919855920756,
+ "f1": 74.84345935966519
+ }
+ },
+ "versions": {
+ "jsquad-1.2-0.3": 1.2
+ },
+ "config": {
+ "model": "hf-causal-experimental",
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,dtype=auto",
+ "num_fewshot": 2,
+ "batch_size": 2,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.jsquad-1.2.sh ADDED
@@ -0,0 +1,3 @@
+ MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto"
+ TASK="jsquad-1.2-0.3"
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-7b/result.jsquad-1.2.json" --batch_size 2
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/harness.sh ADDED
@@ -0,0 +1,3 @@
+ MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True"
+ TASK="jsquad-1.1-0.3,jcommonsenseqa-1.1-0.3,jnli-1.1-0.3,marc_ja-1.1-0.3,jaqket_v2-0.1-0.3,xlsum_ja-1.0-0.3,xwinograd_ja,mgsm-1.0-0.3"
+ python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2,3,3,3,1,1,0,5" --device "cuda" --output_path "models/llama2/llama2-7b/result.json" --batch_size 2 > models/llama2/llama2-7b/harness.out 2> models/llama2/llama2-7b/harness.err
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.json ADDED
@@ -0,0 +1,71 @@
+ {
+ "results": {
+ "jsquad-1.1-0.3": {
+ "exact_match": 58.39711841512832,
+ "f1": 69.52916111780529
+ },
+ "jcommonsenseqa-1.1-0.3": {
+ "acc": 0.5263628239499554,
+ "acc_stderr": 0.014932915029029303,
+ "acc_norm": 0.29222520107238603,
+ "acc_norm_stderr": 0.013601458439195222
+ },
+ "jnli-1.1-0.3": {
+ "acc": 0.28225143796220215,
+ "acc_stderr": 0.009125006713744669,
+ "acc_norm": 0.30156121610517667,
+ "acc_norm_stderr": 0.009304239098715018
+ },
+ "marc_ja-1.1-0.3": {
+ "acc": 0.8604527767951893,
+ "acc_stderr": 0.004608765667738413,
+ "acc_norm": 0.8604527767951893,
+ "acc_norm_stderr": 0.004608765667738413
+ },
+ "jaqket_v2-0.1-0.3": {
+ "exact_match": 38.83161512027491,
+ "f1": 43.653527171568406
+ },
+ "xlsum_ja-1.0-0.3": {
+ "rouge2": 9.32010216666052
+ },
+ "mgsm-1.0-0.3": {
+ "acc": 0.056,
+ "acc_stderr": 0.014570697336899597
+ },
+ "xwinograd_ja": {
+ "acc": 0.6465067778936392,
+ "acc_stderr": 0.015445228301221376
+ }
+ },
+ "versions": {
+ "jsquad-1.1-0.3": 1.1,
+ "jcommonsenseqa-1.1-0.3": 1.1,
+ "jnli-1.1-0.3": 1.1,
+ "marc_ja-1.1-0.3": 1.1,
+ "jaqket_v2-0.1-0.3": 0.1,
+ "xlsum_ja-1.0-0.3": 1.0,
+ "xwinograd_ja": 1.0,
+ "mgsm-1.0-0.3": 1.0
+ },
+ "config": {
+ "model": "hf-causal-experimental",
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True",
+ "num_fewshot": [
+ 2,
+ 3,
+ 3,
+ 3,
+ 1,
+ 1,
+ 0,
+ 5
+ ],
+ "batch_size": 2,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/llama2/llama2-7b/result.jsquad-1.2.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "jsquad-1.2-0.3": {
+ "exact_match": 59.92796037820801,
+ "f1": 70.8236875084182
+ }
+ },
+ "versions": {
+ "jsquad-1.2-0.3": 1.2
+ },
+ "config": {
+ "model": "hf-causal-experimental",
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto",
+ "num_fewshot": 2,
+ "batch_size": 2,
+ "device": "cuda",
+ "no_cache": false,
+ "limit": null,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
scripts/yans/eval/lm-evaluation-harness/models/openai/gpt3/result.mgsm.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/yans/eval/lm-evaluation-harness/models/stablelm/harness.conf ADDED
@@ -0,0 +1,8 @@
+ [DEFAULT]
+ prompt = 0.3
+
+ [model]
+ # XXX change this to your project dir
+ project_dir = .
+ tokenizer = ${project_dir}/tokenizers/nai-hf-tokenizer/
+ args = use_fast=False
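Note: this conf refers to ${project_dir} from within the same file. Python's configparser resolves that style of reference with ExtendedInterpolation; whether the repo's own loader works this way is not shown in the diff, so treat this as an illustration only:

import configparser

# Read models/stablelm/harness.conf and resolve ${project_dir} in the
# tokenizer path via ExtendedInterpolation.
conf = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
conf.read("models/stablelm/harness.conf")
print(conf["model"]["tokenizer"])  # ./tokenizers/nai-hf-tokenizer/
print(conf["DEFAULT"]["prompt"])   # 0.3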
scripts/yans/eval/lm-evaluation-harness/models/stablelm/stablelm-jp-3b-ja50_rp50-700b/harness.conf ADDED
@@ -0,0 +1,2 @@
+ [model]
+ path = ${PROJECT_DIR}/hf_model/3b-ja50_rp50-700b
scripts/yans/eval/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"anli_r2": {"acc": 0.356, "acc_stderr": 0.015149042659306628}}, "versions": {"anli_r2": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"arc_challenge": {"acc": 0.24488054607508533, "acc_norm": 0.2440273037542662, "acc_norm_stderr": 0.012551447627856257, "acc_stderr": 0.012566273985131354}}, "versions": {"arc_challenge": 0}}
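Note: the tests/testdata/*-res.json files above store expected metric values. A rough sketch (not the repository's actual test code) of comparing a freshly computed results dict against one of them with a float tolerance:

import json
import math

# Return True when every expected metric matches the freshly computed one
# within rel_tol. "actual" is assumed to have the same nesting as "results"
# in the expected file.
def matches(expected_path, actual, rel_tol=1e-6):
    with open(expected_path) as f:
        expected = json.load(f)["results"]
    for task, metrics in expected.items():
        for name, value in metrics.items():
            if not math.isclose(actual[task][name], value, rel_tol=rel_tol):
                return False
    return True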
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387
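Note: the extensionless testdata files such as the one above hold a single 64-hex-character string, which has the shape of a SHA-256 digest, presumably a fingerprint used to detect changes in what a task generates. What exactly gets hashed is not visible in this diff, so the serialization in the sketch below is purely hypothetical:

import hashlib

# Hypothetical fingerprint helper: hash a list of strings the way such a
# checksum could be produced, then compare against the stored value.
def fingerprint(items):
    return hashlib.sha256("\n".join(items).encode("utf-8")).hexdigest()

with open("tests/testdata/arithmetic_1dc-v0-loglikelihood") as f:
    expected = f.read().strip()
print(len(expected) == 64)
print(fingerprint(["placeholder request"]) == expected)  # True only with the real inputs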
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e
scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_expletive_it_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_expletive_it_object_raising": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833
scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_superlative_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_2": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until ADDED
@@ -0,0 +1 @@
+ 4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"crows_pairs_english_age": {"likelihood_difference": 0.3160680928470684, "likelihood_difference_stderr": 0.02397758321605678, "pct_stereotype": 0.43956043956043955, "pct_stereotype_stderr": 0.05231815698566189}}, "versions": {"crows_pairs_english_age": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ b85bc849811ccfa9971a6ee3fca7342752c314c0cb6f126e10d9ec4d0450c541
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 6f9119026abff33c5c882d6172e092e806a8b21bd86864022978b1961839350f
scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"crows_pairs_french_sexual_orientation": {"likelihood_difference": 0.3160680928470684, "likelihood_difference_stderr": 0.02397758321605678, "pct_stereotype": 0.43956043956043955, "pct_stereotype_stderr": 0.05231815698566189}}, "versions": {"crows_pairs_french_sexual_orientation": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-high_school_geography": {"acc": 0.2474747474747475, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03191178226713547, "acc_stderr": 0.03074630074212452}}, "versions": {"hendrycksTest-high_school_geography": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 11f40d8f48ba5cd739e21d54c3c04d3761f81df5cb7ddd77df868d24ced44b49
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-high_school_us_history": {"acc": 0.29901960784313725, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.03166009679399814, "acc_stderr": 0.03213325717373618}}, "versions": {"hendrycksTest-high_school_us_history": 0}}
scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-human_aging": {"acc": 0.21524663677130046, "acc_norm": 0.17937219730941703, "acc_norm_stderr": 0.025749819569192804, "acc_stderr": 0.02758406660220827}}, "versions": {"hendrycksTest-human_aging": 0}}