diff --git "a/sft-410m-eval-files/sft-pythia-410m-5shot-shelloutput.txt" "b/sft-410m-eval-files/sft-pythia-410m-5shot-shelloutput.txt" new file mode 100644--- /dev/null +++ "b/sft-410m-eval-files/sft-pythia-410m-5shot-shelloutput.txt" @@ -0,0 +1,514 @@ +2023-07-29:11:33:11,881 INFO [utils.py:148] Note: NumExpr detected 56 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. +2023-07-29:11:33:11,881 INFO [utils.py:160] NumExpr defaulting to 8 threads. +2023-07-29:11:33:13,402 WARNING [__init__.py:52] Failed to load config in + /home/laura/lm-evaluation-harness_big_refactor/lm_eval/tasks/hendrycks_ethics/utilitarianism_original.yaml + Config will not be added to registry + Error: argument of type 'NoneType' is not iterable +2023-07-29:11:33:13,462 INFO [main.py:135] Selected Tasks: ['arc_challenge', 'arc_easy', 'boolq', 'hellaswag', 'lambada_openai', 'openbookqa', 'piqa', 'sciq', 'wikitext', 'winogrande'] +2023-07-29:11:33:13,525 INFO [huggingface.py:109] Using device 'cuda:0' +2023-07-29:11:33:17,358 WARNING [huggingface.py:246] WARNING: The number of total system GPUs does not match the number of spawned processes. If you would like to use data parallelism, please launch the script with 'accelerate launch *script*'. Current run will proceed with 1 devices. +2023-07-29:11:33:20,762 WARNING [builder.py:816] Found cached dataset ai2_arc (/home/laura/.cache/huggingface/datasets/ai2_arc/ARC-Challenge/1.0.0/1569c2591ea2683779581d9fb467203d9aa95543bb9b75dcfde5da92529fd7f6) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:20,767 WARNING [task.py:701] Task 'arc_challenge': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:22,667 WARNING [builder.py:816] Found cached dataset ai2_arc (/home/laura/.cache/huggingface/datasets/ai2_arc/ARC-Easy/1.0.0/1569c2591ea2683779581d9fb467203d9aa95543bb9b75dcfde5da92529fd7f6) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:22,671 WARNING [task.py:701] Task 'arc_easy': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:22,766 WARNING [task.py:571] metric acc is defined, but aggregation is not. using default aggregation=mean +2023-07-29:11:33:22,766 WARNING [task.py:583] metric acc is defined, but higher_is_better is not. using default higher_is_better=True +2023-07-29:11:33:24,624 WARNING [builder.py:816] Found cached dataset super_glue (/home/laura/.cache/huggingface/datasets/super_glue/boolq/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:24,627 WARNING [task.py:701] Task 'boolq': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:26,729 WARNING [builder.py:816] Found cached dataset hellaswag (/home/laura/.cache/huggingface/datasets/hellaswag/default/0.1.0/512a66dd8b1b1643ab4a48aa4f150d04c91680da6a4096498a5e5f799623d5ae) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:26,734 WARNING [task.py:701] Task 'hellaswag': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:31,046 WARNING [builder.py:816] Found cached dataset lambada_openai (/home/laura/.cache/huggingface/datasets/EleutherAI___lambada_openai/default/1.0.0/57baddecfa09d1790541ef07274c5666abfbe9d2ccd0cd46013cd557b0343095) + 0%| | 0/1 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:31,048 WARNING [task.py:302] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended. +2023-07-29:11:33:31,048 WARNING [task.py:701] Task 'lambada_openai': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:31,048 WARNING [task.py:302] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended. +2023-07-29:11:33:32,772 WARNING [builder.py:816] Found cached dataset openbookqa (/home/laura/.cache/huggingface/datasets/openbookqa/main/1.0.1/f338ccacfbc86fb8c2de3aa1c06d2ce686933de3bca284dba97d32592c52b33f) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:32,775 WARNING [task.py:701] Task 'openbookqa': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:34,773 WARNING [builder.py:816] Found cached dataset piqa (/home/laura/.cache/huggingface/datasets/piqa/plain_text/1.1.0/6c611c1a9bf220943c4174e117d3b660859665baf1d43156230116185312d011) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:34,776 WARNING [task.py:701] Task 'piqa': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:37,258 WARNING [builder.py:816] Found cached dataset sciq (/home/laura/.cache/huggingface/datasets/sciq/default/0.1.0/50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:37,262 WARNING [task.py:701] Task 'sciq': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:37,682 WARNING [task.py:571] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity +2023-07-29:11:33:37,682 WARNING [task.py:583] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False +2023-07-29:11:33:37,682 WARNING [task.py:571] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity +2023-07-29:11:33:37,682 WARNING [task.py:583] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False +2023-07-29:11:33:37,682 WARNING [task.py:571] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte +2023-07-29:11:33:37,682 WARNING [task.py:583] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False +2023-07-29:11:33:39,704 WARNING [builder.py:816] Found cached dataset wikitext_document_level (/home/laura/.cache/huggingface/datasets/EleutherAI___wikitext_document_level/wikitext-2-raw-v1/1.0.0/c7f10a7786444f898dd236db33d4bee9b130f8cbcac690e7bde9b0d027e19fc1) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:39,707 WARNING [task.py:701] Task 'wikitext': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:41,552 WARNING [builder.py:816] Found cached dataset winogrande (/home/laura/.cache/huggingface/datasets/winogrande/winogrande_xl/1.1.0/a826c3d3506aefe0e9e9390dcb53271070536586bab95849876b2c1743df56e2) + 0%| | 0/3 [00:00 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:41,556 WARNING [task.py:701] Task 'winogrande': num_fewshot > 0 but fewshot_split is None. using preconfigured rule. +2023-07-29:11:33:42,560 INFO [task.py:357] Building contexts for task 'arc_challenge' on rank 0... +2023-07-29:11:33:54,836 INFO [evaluator.py:210] Task: arc_challenge; number of requests on this rank: 4687 +2023-07-29:11:33:54,837 INFO [task.py:357] Building contexts for task 'arc_easy' on rank 0... +2023-07-29:11:34:19,682 INFO [evaluator.py:210] Task: arc_easy; number of requests on this rank: 9501 +2023-07-29:11:34:19,682 INFO [task.py:357] Building contexts for task 'boolq' on rank 0... +2023-07-29:11:34:34,039 INFO [evaluator.py:210] Task: boolq; number of requests on this rank: 6540 +2023-07-29:11:34:34,040 INFO [task.py:357] Building contexts for task 'hellaswag' on rank 0... +2023-07-29:11:38:46,649 INFO [evaluator.py:210] Task: hellaswag; number of requests on this rank: 40168 +2023-07-29:11:38:46,651 INFO [task.py:357] Building contexts for task 'lambada_openai' on rank 0... +2023-07-29:11:39:29,240 INFO [evaluator.py:210] Task: lambada_openai; number of requests on this rank: 5153 +2023-07-29:11:39:29,241 INFO [task.py:357] Building contexts for task 'openbookqa' on rank 0... +2023-07-29:11:39:33,131 INFO [evaluator.py:210] Task: openbookqa; number of requests on this rank: 2000 +2023-07-29:11:39:33,131 INFO [task.py:357] Building contexts for task 'piqa' on rank 0... +2023-07-29:11:39:43,911 INFO [evaluator.py:210] Task: piqa; number of requests on this rank: 3676 +2023-07-29:11:39:43,911 INFO [task.py:357] Building contexts for task 'sciq' on rank 0... +2023-07-29:11:39:52,155 INFO [evaluator.py:210] Task: sciq; number of requests on this rank: 4000 +2023-07-29:11:39:52,156 INFO [task.py:357] Building contexts for task 'wikitext' on rank 0... +2023-07-29:11:39:52,638 INFO [evaluator.py:210] Task: wikitext; number of requests on this rank: 62 +2023-07-29:11:39:52,638 INFO [task.py:357] Building contexts for task 'winogrande' on rank 0... +2023-07-29:11:39:52,740 INFO [evaluator.py:210] Task: winogrande; number of requests on this rank: 2534 +2023-07-29:11:39:52,740 INFO [evaluator.py:243] Running loglikelihood requests + 0%| | 0/78259 [00:00", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "word_perplexity" + }, + { + "metric": "byte_perplexity" + }, + { + "metric": "bits_per_byte" + } + ], + "output_type": "loglikelihood_rolling", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "{{page}}" + }, + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "", + "doc_to_target": "", + "doc_to_choice": "", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false + } + }, + "versions": { + "arc_challenge": "Yaml", + "arc_easy": "Yaml", + "boolq": "Yaml", + "hellaswag": "Yaml", + "lambada_openai": "Yaml", + "openbookqa": "Yaml", + "piqa": "Yaml", + "sciq": "Yaml", + "wikitext": "Yaml", + "winogrande": "Yaml" + }, + "config": { + "model": "hf", + "model_args": "pretrained=lomahony/eleuther-pythia410m-hh-sft", + "num_fewshot": 5, + "batch_size": 16, + "batch_sizes": [], + "device": "cuda:0", + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000 + }, + "git_hash": "4e44f0a" +} +hf (pretrained=lomahony/eleuther-pythia410m-hh-sft), limit: None, num_fewshot: 5, batch_size: 16 +| Task |Version|Filter| Metric | Value | |Stderr| +|--------------|-------|------|---------------|------:|---|-----:| +|arc_challenge |Yaml |none |acc | 0.2363|± |0.0124| +| | |none |acc_norm | 0.2696|± |0.0130| +|arc_easy |Yaml |none |acc | 0.5505|± |0.0102| +| | |none |acc_norm | 0.5290|± |0.0102| +|boolq |Yaml |none |acc | 0.5015|± |0.0087| +|hellaswag |Yaml |none |acc | 0.3381|± |0.0047| +| | |none |acc_norm | 0.4011|± |0.0049| +|lambada_openai|Yaml |none |perplexity |14.0923|± |0.4548| +| | |none |acc | 0.4434|± |0.0069| +|openbookqa |Yaml |none |acc | 0.1780|± |0.0171| +| | |none |acc_norm | 0.2880|± |0.0203| +|piqa |Yaml |none |acc | 0.6861|± |0.0108| +| | |none |acc_norm | 0.6844|± |0.0108| +|sciq |Yaml |none |acc | 0.8860|± |0.0101| +| | |none |acc_norm | 0.8790|± |0.0103| +|wikitext |Yaml |none |word_perplexity|35.9679| | | +| | |none |byte_perplexity| 1.8051| | | +| | |none |bits_per_byte | 0.8521| | | +|winogrande |Yaml |none |acc | 0.5328|± |0.0140| +