diff --git a/scripts/decode/en-ja/llama2/beam_search.sh b/scripts/decode/en-ja/llama2/beam_search.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8cf0818befb9ee56e05db71bbbf264db248fe4c5
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/beam_search.sh
@@ -0,0 +1,19 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+MAX_INPUT_TOKENS=158
+BEAM_SIZE=50
+
+python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+    --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt2024_test/en-ja/llama2-beam \
+    -g 0 1 2 3 4 5 6 7 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 3.0 \
+    --num_return_sequences ${BEAM_SIZE} \
+    --num_beams ${BEAM_SIZE} \
+    --max_input_tokens ${MAX_INPUT_TOKENS} \
+    -b 158
+
diff --git a/scripts/decode/en-ja/llama2/greedy_inference.sh b/scripts/decode/en-ja/llama2/greedy_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1fe3cb36a1b5ed6964b442053e94c2b5365775a7
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/greedy_inference.sh
@@ -0,0 +1,13 @@
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt24_test/en-ja/mistral-greedy \
+    -g 0 \
+    -b 4096 \
+    --dynamic_max_new_token_ratio 3.0
+
+echo "Done!"
+
diff --git a/scripts/decode/en-ja/llama2/hf_inference.sh b/scripts/decode/en-ja/llama2/hf_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1fe3cb36a1b5ed6964b442053e94c2b5365775a7
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/hf_inference.sh
@@ -0,0 +1,13 @@
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt24_test/en-ja/mistral-greedy \
+    -g 0 \
+    -b 4096 \
+    --dynamic_max_new_token_ratio 3.0
+
+echo "Done!"
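Note: the decoding flags above map onto standard Hugging Face generation arguments. The short Python sketch below is not part of the repository; it is a minimal illustration, assuming the hf_inference*.py tools wrap transformers' generate(), of what the beam-search, greedy, and (in the scripts that follow) nucleus-sampling settings roughly correspond to. The model path, prompt, and the max_new_tokens heuristic are placeholder assumptions.

# Illustrative sketch only -- not part of this repository.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/path/to/finetuned-checkpoint"  # hypothetical stand-in for the /work/models/... checkpoints
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation="sdpa")

prompt = "Translate the following English into Japanese: Hello."  # placeholder prompt template
inputs = tokenizer(prompt, return_tensors="pt")
# assumed reading of --dynamic_max_new_token_ratio 3.0: output budget scales with input length
max_new_tokens = int(inputs["input_ids"].shape[1] * 3.0)

# beam_search.sh: --num_beams 50 --num_return_sequences 50
beam_outputs = model.generate(
    **inputs, num_beams=50, num_return_sequences=50, max_new_tokens=max_new_tokens
)

# greedy_inference.sh / hf_inference.sh: no sampling flags, i.e. greedy decoding
greedy_output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)

# top_p_inference*.sh below: nucleus sampling, many candidates per source sentence
sampled_outputs = model.generate(
    **inputs, do_sample=True, top_p=0.95, num_return_sequences=50, max_new_tokens=max_new_tokens
)

print(tokenizer.batch_decode(greedy_output, skip_special_tokens=True)[0])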
+
diff --git a/scripts/decode/en-ja/llama2/top_p_inference.sh b/scripts/decode/en-ja/llama2/top_p_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f5315754a87cc492924da6d16fc5421401199cbe
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/top_p_inference.sh
@@ -0,0 +1,17 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+i=4
+GPU_ID=4
+python /code/llm-recipes/tools/hf_inference.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+    -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+    -o /work/translation/wmt24_test/en-ja/mistral-top-p-0.95/split_0${i} \
+    -g ${GPU_ID} \
+    -b 500 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 3.0 \
+    --num_return_sequences 100 \
+    --do_sample \
+    --top_p 0.95 &
diff --git a/scripts/decode/en-ja/llama2/top_p_inference_1.sh b/scripts/decode/en-ja/llama2/top_p_inference_1.sh
new file mode 100644
index 0000000000000000000000000000000000000000..118db3b3d19f0261b103102bcf8001c9378c232d
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/top_p_inference_1.sh
@@ -0,0 +1,20 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+for i in `seq 0 6`; do
+    python /code/llm-recipes/tools/hf_inference.py \
+        --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+        -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+        -o /work/translation/wmt24_test/en-ja/llama2-top-p-0.95/split_0${i} \
+        -g ${i} \
+        -b 158 \
+        --attn_implementation sdpa \
+        --dynamic_max_new_token_ratio 3.0 \
+        --num_return_sequences 50 \
+        --do_sample \
+        --top_p 0.95 \
+        --max_input_tokens 158 &
+done
+wait
+
diff --git a/scripts/decode/en-ja/llama2/top_p_inference_2.sh b/scripts/decode/en-ja/llama2/top_p_inference_2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a5e987894a82f07fe482302d15df7546103d9c4a
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/top_p_inference_2.sh
@@ -0,0 +1,21 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+for i in `seq 7 9`; do
+    GPU_ID=$((i-5))
+    python /code/llm-recipes/tools/hf_inference.py \
+        --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+        -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+        -o /work/translation/wmt24_test/en-ja/llama2-top-p-0.95/split_0${i} \
+        -g ${GPU_ID} \
+        -b 158 \
+        --attn_implementation sdpa \
+        --dynamic_max_new_token_ratio 3.0 \
+        --num_return_sequences 50 \
+        --do_sample \
+        --top_p 0.95 \
+        --max_input_tokens 158 &
+done
+wait
+
diff --git a/scripts/decode/en-ja/mistral-ve/top_p_inference.sh b/scripts/decode/en-ja/mistral-ve/top_p_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..264af26aad7995586bb3e76b555f077963460849
--- /dev/null
+++ b/scripts/decode/en-ja/mistral-ve/top_p_inference.sh
@@ -0,0 +1,16 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-averaged-596-600 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt2024_test/en-ja/mistral-ve-top-p-0.95 \
+    -g 0 1 2 3 4 5 6 7 \
+    -b 125 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 2.0 \
+    --num_return_sequences 80 \
+    --do_sample \
+    --top_p 0.95 \
+    --max_input_tokens 125
diff --git a/scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh b/scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e3080622a2320beb3c4da8e234930417eafdedfb
--- /dev/null
+++ b/scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh
@@ -0,0 +1,17 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-averaged-596-600 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt2024_test/en-ja/mistral-ve-top-p-0.95-cpo \
+    -p /work/models/dpo/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-cpo-lora/checkpoint-200 \
+    -g 0 1 2 3 4 5 6 7 \
+    -b 125 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 2.0 \
+    --num_return_sequences 80 \
+    --do_sample \
+    --top_p 0.95 \
+    --max_input_tokens 125 \
diff --git a/scripts/decode/en-ja/mistral/top_p_inference_2.sh b/scripts/decode/en-ja/mistral/top_p_inference_2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0cfe66af2a59dd2dd679fc7e21c5e38147e7791b
--- /dev/null
+++ b/scripts/decode/en-ja/mistral/top_p_inference_2.sh
@@ -0,0 +1,20 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+for i in `seq 8 9`; do
+    # minus 2 for gpu id
+    GPU_ID=$((i-2))
+    python /code/llm-recipes/tools/hf_inference.py \
+        --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+        -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+        -o /work/translation/wmt24_test/en-ja/mistral-top-p-0.95/split_0${i} \
+        -g ${GPU_ID} \
+        -b 400 \
+        --attn_implementation sdpa \
+        --dynamic_max_new_token_ratio 3.0 \
+        --num_return_sequences 100 \
+        --do_sample \
+        --top_p 0.95 &
+done
+wait
diff --git a/scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml b/scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b748aab5c06533fd3f8d41cfd519841a9af93f75
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml
@@ -0,0 +1,72 @@
+name: Tasks Modified
+
+on:
+  push:
+    branches:
+      - 'main'
+  pull_request:
+    branches:
+      - 'main'
+  workflow_dispatch:
+# comment/edit out the above to stop/change the triggers
+jobs:
+  changed_files:
+    runs-on: ubuntu-latest # windows-latest || macos-latest
+    timeout-minutes: 120
+    name: Scan for changed tasks
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
+
+      # Uses the tj-actions/changed-files action to check for changes.
+ # Outputs provided here: https://github.com/tj-actions/changed-files#outputs + # The `files_yaml` input optionally takes a yaml string to specify filters, + # and prepends the filter name to the standard output names. + - name: Check task folders + id: changed-tasks + uses: tj-actions/changed-files@v44.5.2 + with: + # tasks checks the tasks folder and api checks the api folder for changes + files_yaml: | + tasks: + - lm_eval/tasks/** + api: + - lm_eval/api/** + write_output_files: true + + # The next step is optional; the files are written to the workspace by default (above). + # so it's just for debugging + - name: Run Tests + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + run: | + echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV' + echo "One or more test file(s) has changed." + echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" + + - name: Set up Python 3.9 + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + uses: actions/setup-python@v4 + with: + python-version: 3.9 + cache: 'pip' + cache-dependency-path: setup.py + - name: Install dependencies + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu + # Install optional git dependencies + # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt + # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + # if new tasks are added, run tests on them + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' + run: python -m pytest tests/test_tasks.py -s -vv + # if api is modified, run tests on it + - name: Test more tasks with pytest + env: + API: true + if: steps.changed-tasks.outputs.api_any_modified == 'true' + run: python -m pytest tests/test_tasks.py -s -vv diff --git a/scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml b/scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..be3481754e270f28bcb65e8c75b880aa7ebf2bac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml @@ -0,0 +1,78 @@ +name: Publish Python distribution to PyPI + +on: + push: + tags: + - '*' + +jobs: + build: + name: Build distribution + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v3 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: >- + Publish Python distribution to PyPI + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/lm_eval + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: 
python-package-distributions + path: dist/ + - name: Publish distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + publish-to-testpypi: + name: Publish Python distribution to TestPyPI + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/lm_eval + + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml b/scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..49b85fb9a4541f6c6dfecd4395a4544dc4ec5aac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml @@ -0,0 +1,95 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python +# just comment out unwanted steps to turn off the test. +name: Unit Tests + +on: + push: + branches: + - 'main' + pull_request: + branches: + - 'main' + workflow_dispatch: +# Jobs run concurrently and steps run sequentially within a job. +# jobs: linter and cpu_tests. Add more jobs/steps as required. +jobs: + linter: + name: Linters + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Pre-Commit + env: + SKIP: "no-commit-to-branch,mypy" + + uses: pre-commit/action@v3.0.1 +# # mypy turned off for now +# - name: Lint with mypy +# run: mypy . 
--ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable +# Job 2 + testcpu: + name: CPU Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11" ] + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu +# Install optional git dependencies +# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt +# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* + testmodels: + name: External LM Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Test with pytest + run: python -m pytest tests/models --showlocals -s -vv + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd8d5da70352396b88119f1cd2c83f04b44dda96 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..600b26076c58d09bf2afd2f42873a287dfd3827f Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adb21c6123f66da878826f08203d06c2ee84bafa Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..61f695e8d01399953a4e6337655904008c885020 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..539173175d4d22ee449401c400b33aa776b35880 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba146834fe2c7611fef4b73d6f56056d23e63794 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b7035b7408d03e277256a87ecd5c3511ca957e5 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9eef8d4a193e4a1e324db212fbbc2ec43961b613 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f16dab506fe90e374c3fd993ba602ba732ef0dc Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..8d9db6821724c497c4a27116a1238e3b8d32ae29 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Callable, Iterable, List, Union + +from lm_eval.api.instance import Instance + + +class Filter(ABC): + """ + Filter classes operate on a per-task level. + They take all model outputs (`instance.resps` for all `task.instances`) + across all instances of a task, and perform operations. + In a single run, one can configure any number of separate filters or lists of filters. + + """ + + def __init__(self, **kwargs) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. + """ + + @abstractmethod + def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable: + """ + Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. + Should return the list of (filtered) response lists *in the same order as they were input*, e.g. 
+ if pass in [, ] should return + [, ] + """ + return resps + + +@dataclass +class FilterEnsemble: + """ + FilterEnsemble creates a pipeline applying multiple filters. + Its intended usage is to stack multiple post-processing steps in order. + `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each + pipeline separately. + """ + + name: str + filters: List[Callable[[], Filter]] + + def apply(self, instances: List[Instance]) -> None: + resps, docs = zip(*((inst.resps, inst.doc) for inst in instances)) + resps, docs = list(resps), list(docs) + + for f in self.filters: + # apply filters in sequence + resps = f().apply(resps, docs) + + # add the end results after filtering to filtered_requests of their respective source instances. + # has key `self.name`: each FilterEnsemble applied in a given run should use a different name. + for inst, resp in zip(instances, resps): + inst.filtered_resps[self.name] = resp diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/group.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/group.py new file mode 100644 index 0000000000000000000000000000000000000000..534e6ad0103ee5aa79c6badc5550b1b355b718f7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/group.py @@ -0,0 +1,117 @@ +import abc +from dataclasses import asdict, dataclass +from inspect import getsource +from typing import Any, Callable, List, Optional, Union + + +@dataclass +class AggMetricConfig(dict): + metric: Optional[str] = None + aggregation: Optional[str] = "mean" + weight_by_size: Optional[str] = False + # list of filter names which should be incorporated into the aggregated metric. + filter_list: Optional[Union[str, list]] = "none" + + def __post_init__(self): + if self.aggregation != "mean": + raise ValueError( + f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{self.aggregation}'." + ) + + if isinstance(self.filter_list, str): + self.filter_list = [self.filter_list] + + +@dataclass +class GroupConfig(dict): + group: Optional[str] = None + group_alias: Optional[str] = None + task: Optional[Union[str, list]] = None + aggregate_metric_list: Optional[ + Union[List[AggMetricConfig], AggMetricConfig, dict] + ] = None + metadata: Optional[dict] = ( + None # by default, not used in the code. allows for users to pass arbitrary info to tasks + ) + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + return setattr(self, item, value) + + def __post_init__(self): + if self.aggregate_metric_list is not None: + if isinstance(self.aggregate_metric_list, dict): + self.aggregate_metric_list = [self.aggregate_metric_list] + + self.aggregate_metric_list = [ + AggMetricConfig(**item) if isinstance(item, dict) else item + for item in self.aggregate_metric_list + ] + + def to_dict(self, keep_callable: bool = False) -> dict: + """dumps the current config as a dictionary object, as a printable format. + null fields will not be printed. + Used for dumping results alongside full task configuration + + :return: dict + A printable dictionary version of the TaskConfig object. + + # TODO: should any default value in the TaskConfig not be printed? 
+ """ + cfg_dict = asdict(self) + # remove values that are `None` + for k, v in list(cfg_dict.items()): + if callable(v): + cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) + return cfg_dict + + def serialize_function( + self, value: Union[Callable, str], keep_callable=False + ) -> Union[Callable, str]: + """Serializes a given function or string. + + If 'keep_callable' is True, the original callable is returned. + Otherwise, attempts to return the source code of the callable using 'getsource'. + """ + if keep_callable: + return value + else: + try: + return getsource(value) + except (TypeError, OSError): + return str(value) + + +class ConfigurableGroup(abc.ABC): + def __init__( + self, + config: Optional[dict] = None, + ) -> None: + self._config = GroupConfig(**config) + + @property + def group(self): + return self._config.group + + @property + def group_alias(self): + return self._config.group_alias + + @property + def version(self): + return self._config.version + + @property + def config(self): + return self._config.to_dict() + + @property + def group_name(self) -> Any: + return self._config.group + + def __repr__(self): + return ( + f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py new file mode 100644 index 0000000000000000000000000000000000000000..d3c6afa0644e729ba441728c72a2469fdad07b8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass, field +from typing import Literal, Optional, Tuple + + +OutputType = Literal[ + "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice" +] + + +@dataclass +class Instance: + request_type: OutputType + doc: dict + arguments: tuple + idx: int + metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field( + default_factory=lambda: (None, None, None) + ) + resps: list = field(default_factory=list) + filtered_resps: dict = field(default_factory=dict) + + # initialized after init + task_name: Optional[str] = None + doc_id: Optional[int] = None + repeats: Optional[int] = None + + def __post_init__(self) -> None: + # unpack metadata field + self.task_name, self.doc_id, self.repeats = self.metadata + + @property + def args(self): + """ + Returns (string,) where `string` is the string to calculate loglikelihood over + """ + return ( + self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..a8459aa7397fd02947917dad616520bb4cb777bd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py @@ -0,0 +1,570 @@ +import logging +import math +import random +import re +import string +from collections.abc import Iterable +from typing import List + +import numpy as np +import sacrebleu + +from lm_eval.api.registry import register_aggregation, register_metric + + +eval_logger = logging.getLogger("lm-eval") + + +# Register Aggregations First +@register_aggregation("bypass") +def bypass_agg(arr): + return 999 + + +@register_aggregation("mean") +def mean(arr): + return sum(arr) / len(arr) + + +@register_aggregation("median") +def median(arr): + return arr[len(arr) // 2] + + +# Certain metrics must be calculated across all documents in a benchmark. 
+# We use them as aggregation metrics, paired with no-op passthrough metric fns. +@register_aggregation("perplexity") +def perplexity(items): + return math.exp(-mean(items)) + + +@register_aggregation("weighted_perplexity") +def weighted_perplexity(items): + return math.exp(-weighted_mean(items)) + + +@register_aggregation("bits_per_byte") +def bits_per_byte(items): + return -weighted_mean(items) / math.log(2) + + +@register_aggregation("f1") +def f1_score(items): + from sklearn.metrics import f1_score + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds) + + return np.max(fscore) + + +@register_aggregation("matthews_corrcoef") +def matthews_corrcoef(items): + from sklearn.metrics import matthews_corrcoef + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + return matthews_corrcoef(golds, preds) + + +@register_aggregation("bleu") +def bleu(items): + """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric + for evaluating a generated sentence to a reference sentence. It counts matching + n-grams in the candidate translation to n-grams in the reference text, where + 1-gram or unigram would be each token and a bigram comparison would be each + word pair. The comparison is made regardless of word order + Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/ + Paper: https://www.aclweb.org/anthology/P02-1040/ + + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_bleu(preds, refs).score + + +@register_aggregation("chrf") +def chrf(items): + """chrF++ is a tool for automatic evaluation of machine translation output + based on character n-gram precision and recall enhanced with word n-grams. 
+ Source: https://github.com/m-popovic/chrF + Paper: https://www.aclweb.org/anthology/W15-3049.pdf + + Higher is better # TODO I think + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_chrf(preds, refs).score + + +@register_aggregation("ter") +def ter(items): + """Translation Error Rate is an error metric for machine translation that + measures the number of edits required to change a system output into one + of the references + Source: http://www.cs.umd.edu/~snover/tercom/ + Paper: http://mt-archive.info/AMTA-2006-Snover.pdf + + Lower is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_ter(preds, refs).score + + +@register_aggregation("brier_score") +def brier_score(items): # This is a passthrough function + gold, predictions = list(zip(*items)) + bs, num_class = np.array(predictions).shape + + gold = list(gold) + gold_one_hot = np.eye(num_class)[gold] + return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1)) + + +@register_metric( + metric="brier_score", + higher_is_better=False, + output_type=["multiple_choice"], + aggregation="brier_score", +) +def brier_score_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice"], + aggregation="mean", +) +def acc_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_norm", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice"], + aggregation="mean", +) +def acc_norm_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_mutual_info", + higher_is_better=True, + output_type="multiple_choice", + aggregation="mean", +) +def acc_mutual_info_fn(items): # This is a passthrough function + return items + + +### the code used in the `exact_match_hf_evaluate` function is ported from +### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py +### which is under the apache license. + +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+def exact_match_hf_evaluate( + predictions, + references, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, +): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + references = np.array([re.sub(s, "", x) for x in references]) + else: + predictions = np.asarray(predictions) + references = np.asarray(references) + + if ignore_case: + predictions = np.char.lower(predictions) + references = np.char.lower(references) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + score_list = predictions == references + + return {"exact_match": np.mean(score_list)} + + +### + + +@register_metric( + metric="exact_match", + higher_is_better=True, + output_type="generate_until", + aggregation="mean", +) +def exact_match_fn(**kwargs): + return exact_match_hf_evaluate(**kwargs) + + +@register_metric( + metric="perplexity", + higher_is_better=False, + output_type="loglikelihood", + aggregation="perplexity", +) +def perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="word_perplexity", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="weighted_perplexity", +) +def word_perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="byte_perplexity", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="weighted_perplexity", +) +def byte_perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="bits_per_byte", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="bits_per_byte", +) +def bits_per_byte_fn(items): # This is a passthrough function + return items + + +def pop_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr)) + + +def sample_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1)) + + +def mean_stderr(arr): + return sample_stddev(arr) / math.sqrt(len(arr)) + + +@register_metric( + metric="bypass", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice", "generate_until"], + aggregation="bypass", +) +def bypass(items): + return None + + +@register_metric( + metric="mcc", + higher_is_better=True, + output_type="multiple_choice", + aggregation="matthews_corrcoef", +) +def mcc_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="f1", + higher_is_better=True, + output_type="multiple_choice", + aggregation="f1", +) +def f1_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="bleu", + higher_is_better=True, + output_type="generate_until", + aggregation="bleu", +) +def bleu_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="chrf", + higher_is_better=True, + output_type="generate_until", + aggregation="chrf", +) +def chrf_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="ter", + higher_is_better=True, + 
output_type="generate_until", + aggregation="ter", +) +def ter_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_all", + higher_is_better=True, + output_type="loglikelihood", + aggregation="mean", +) +def acc_all(items): + # Only count as correct if all answers are labeled correctly for each question + question_scoring_dict = {} + preds = list(zip(*items))[0] + docs = list(zip(*items))[1] + + for doc, pred in zip(docs, preds): + paragraph_id = doc["idx"]["paragraph"] + question_id = doc["idx"]["question"] + if (paragraph_id, question_id) not in question_scoring_dict: + question_scoring_dict[(paragraph_id, question_id)] = [] + + gold_label = doc["label"] == 1 + + question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred) + acc = np.mean([int(all(x)) for x in question_scoring_dict.values()]) + return acc + + +def acc_all_stderr(items): + # Only count as correct if all answers are labeled correctly for each question + question_scoring_dict = {} + preds = list(zip(*items))[0] + docs = list(zip(*items))[1] + + for doc, pred in zip(docs, preds): + question_id = doc["idx"]["question"] + if question_id not in question_scoring_dict: + question_scoring_dict[question_id] = [] + + gold_label = doc["label"] == 1 + question_scoring_dict[question_id].append(gold_label == pred) + + acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()]) + return acc + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + """Compute max metric between prediction and each ground truth.""" + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def weighted_mean(items): + a, b = zip(*items) + return sum(a) / sum(b) + + +def is_non_str_iterable(obj): + return isinstance(obj, Iterable) and not isinstance(obj, str) + + +def _sacreformat(refs, preds): + """Format refs and preds for sacrebleu corpus calculation. It is very particular""" + # Sacrebleu expects (List[str], List[List[str]) + # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...]) + + # Note [ref1_stream] is the first reference for each pred. + # So lists are size N and (M, N) for N preds and M possible refs for each pred + # This is a different order of dimensions that I would expect + + # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds + # Must become List[List[str]] with the inner list corresponding to preds + if not is_non_str_iterable(refs): + refs = list(refs) + if not is_non_str_iterable(refs[0]): + refs = [[ref] for ref in refs] + refs = list(zip(*refs)) + # Note the number of refs in each ref list much match the number of preds + + # We expect preds to be List[str] or List[List[str]]. 
Must become List[str] + if not is_non_str_iterable(preds): + preds = list(preds) + if is_non_str_iterable(preds[0]): + assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}" + preds = [pred[0] for pred in preds] + + return refs, preds + + +# stderr stuff + + +class _bootstrap_internal: + def __init__(self, f, n) -> None: + self.f = f + self.n = n + + def __call__(self, v): + i, xs = v + rnd = random.Random() + rnd.seed(i) + res = [] + for _ in range(self.n): + res.append(self.f(rnd.choices(xs, k=len(xs)))) + return res + + +def bootstrap_stderr(f, xs, iters): + import multiprocessing as mp + + pool = mp.Pool(mp.cpu_count()) + # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something + # equivalent to stderr calculated without Bessel's correction in the stddev. + # Unfortunately, I haven't been able to figure out what the right correction is + # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but + # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator) + # Thankfully, shouldn't matter because our samples are pretty big usually anyways + res = [] + chunk_size = min(1000, iters) + from tqdm import tqdm + + print("bootstrapping for stddev:", f.__name__) + for bootstrap in tqdm( + pool.imap( + _bootstrap_internal(f, chunk_size), + [(i, xs) for i in range(iters // chunk_size)], + ), + total=iters // chunk_size, + ): + # sample w replacement + res.extend(bootstrap) + + pool.close() + return sample_stddev(res) + + +def stderr_for_metric(metric, bootstrap_iters: int): + if bootstrap_iters <= 0: + # return no function (don't compute stderr) if bootstrap iters = 0 + return None + + bootstrappable = [ + median, + matthews_corrcoef, + f1_score, + perplexity, + bleu, + chrf, + ter, + ] + + if metric in bootstrappable: + return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters) + + stderr = {mean: mean_stderr, acc_all: acc_all_stderr} + + return stderr.get(metric, None) + + +def pooled_sample_stderr(stderrs: List[float], sizes: List[int]): + # Used to aggregate bootstrapped stderrs across subtasks in a group, + # when we are weighting by the size of each subtask. + # + + assert len(stderrs) == len(sizes) + + # formula source: https://en.wikipedia.org/wiki/Pooled_variance + # and: https://stats.stackexchange.com/a/4841331 + # this empirically seems to match running `stderr_for_metric` on all instances + # from the subtasks concatenated with each other. + pooled_sample_var = ( + sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)]) + ) / (sum(sizes) - len(sizes)) + + return np.sqrt(pooled_sample_var / sum(sizes)) + + +def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None): + assert ( + metrics is not None + ), "Need to pass a list of each subtask's metric for this stderr aggregation" + assert len(stderrs) == len(sizes) and len(sizes) == len(metrics) + + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation. + # This formula depends on sample means. + # removed because it seems to give erroneously huge stderrs for groupings of tasks + # and does not seem to match up with bootstrap-calculated stderrs for groups. 
+ + ### don't use this unless a statistician has told you it's the right thing to do ### + + # accumulators: we'll aggregate pairwise N - 1 times + variance = stderrs[0] ** 2 + curr_size = sizes[0] + curr_score = metrics[0] + + for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]): + curr_score = ((curr_score * curr_size) + (score * size)) / ( + curr_size + size + ) # NOTE: this assumes our aggregation fn is "mean" + + variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / ( + curr_size + size - 1 + ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * ( + curr_score - score + ) ** 2 + + return np.sqrt(variance) + + +def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True): + # A helper function that is used to aggregate + # subtask scores cross-task. + # TODO: does not hold for non-mean aggregations + if not weight_by_size: + sizes = [1] * len(sizes) + + assert len(metrics) == len(sizes) + + return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/model.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a3602736d230b196eac4d384978ae1b62b7b4fe2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/model.py @@ -0,0 +1,385 @@ +import abc +import hashlib +import json +import logging +import os +from typing import Dict, List, Optional, Tuple, Type, TypeVar + +import transformers +from sqlitedict import SqliteDict +from tqdm import tqdm + +from lm_eval import utils + + +eval_logger = logging.getLogger("lm-eval") + +T = TypeVar("T", bound="LM") + + +class LM(abc.ABC): + def __init__(self) -> None: + """Defines the interface that should be implemented by all LM subclasses. + LMs are assumed to take text (strings) as input and yield strings as output + (inputs/outputs should be tokenization-agnostic.) + + """ + # set rank and world size to a single process, by default. + self._rank = 0 + self._world_size = 1 + self.cache_hook = CacheHook(None) + + @abc.abstractmethod + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """Compute log-likelihood of generating a continuation from a context. + Downstream tasks should attempt to use loglikelihood instead of other + LM calls whenever possible. + + :param requests: list[Instance] + A list of Instance objects, with property `args` which returns a tuple (context, continuation). + `context: str` + Context string. Implementations of LM must be able to handle an + empty context string. + `continuation: str` + The continuation over which log likelihood will be calculated. If + there is a word boundary, the space should be in the continuation. + For example, context="hello" continuation=" world" is correct. + + :return: list[tuple[float, bool]] + A list of pairs (logprob, isgreedy) + `logprob: float` + The log probability of `continuation`. + `isgreedy`: + Whether `continuation` would be generated by greedy sampling from `context`. + """ + pass + + @abc.abstractmethod + def loglikelihood_rolling(self, requests) -> List[float]: + """Compute full log-likelihood of a string, with no truncation, for perplexity computation + - We will use the full max context length of the model. + - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to + the max context length. 
+ - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations + which may simply concatenate multiple documents together. + - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into + multiple chunks, the last input will still a full-sized context. + Example: + Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ] + Prefix: BOS/EOS + Max context length: 4 + Resulting input/prediction pairs: + + INPUT: BOS 0 1 2 + PRED: 0 1 2 3 + + INPUT: 3 4 5 6 + PRED: 4 5 6 7 + + INPUT: 5 6 7 8 + PRED: 8 9 + + Observe that: + 1. Each token is predicted exactly once + 2. For the last pair, we provide the full context, but only score the last two tokens + + :param requests: list[Instance] + A list of Instance objects with property `args` which returns a tuple (context,). + string: str + String for which we are computing overall loglikelihood + :return: list[tuple[float]] + A list of tuples (logprob,) + logprob: float + The log probability of `context` conditioned on the BOS/EOS token. + Can also be overridden for custom cases by `prefix_token_id`. + """ + pass + + # TODO: Add an optional max length + @abc.abstractmethod + def generate_until(self, requests) -> List[str]: + """Generate greedily until a stopping sequence + + :param requests: list[Instance] + A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs). + context: str + Context string + gen_kwargs: dict + A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc. + :return: list[str] + A list of model generated continuations. + continuation: str + The generated continuation. + """ + pass + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + """ + Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM. + + :param chat_history: list[dict[str, str]] + A list of dictionaries with keys 'role' and 'content'. + Values are strings representing the role name and the content of the message, respectively. + :return: str + A string representing the chat history in a format that can be used as input to the LM. + """ + raise NotImplementedError( + "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type." + ) + + @classmethod + def create_from_arg_string( + cls: Type[T], arg_string: str, additional_config: Optional[dict] = None + ) -> T: + """ + Creates an instance of the LM class using the given argument string and additional config. + + Parameters: + - arg_string: A string containing arguments in the format key1=value1,key2=value2. + - additional_config: Optional dictionary containing additional configuration parameters. + + Returns: + - Instance of the LM class. + """ + additional_config = {} if additional_config is None else additional_config + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(**args, **args2) + + @classmethod + def create_from_arg_obj( + cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None + ) -> T: + """ + Creates an instance of the LM class using the given arg_obj + + Parameters: + - arg_obj: A dict containing arguments in the format key1=value1,key2=value2. + - additional_config: Optional dictionary containing additional configuration parameters. + + Returns: + - Instance of the LM class. 
+ """ + + additional_config = {} if additional_config is None else additional_config + additional_config = { + k: v for k, v in additional_config.items() if v is not None + } + + return cls(**arg_dict, **additional_config) + + @property + def rank(self): + # used in the case of parallelism. Hardcoded to + # ensure no errors arise using API models which do + # not support multi-device parallelism nor expect it. + return self._rank + + @property + def world_size(self): + # used in the case of parallelism. Hardcoded to + # ensure no errors arise using API models which do + # not support multi-device parallelism nor expect it. + return self._world_size + + @property + def tokenizer_name(self) -> str: + """Must be defined for LM subclasses which implement Chat Templating. + Should return the name of the tokenizer or chat template used. + Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used. + """ + raise NotImplementedError( + "To use this model with chat templates, please implement the 'tokenizer_name' property." + ) + + @property + def chat_template(self) -> str: + """Must be defined for LM subclasses that implement Chat Templating. + Should return the structure of the chat template applied to user/assistant messages. + This is used only to save in the experiment results for reproducibility. + """ + raise NotImplementedError( + "To use this model with chat templates, please implement the 'chat_template' property." + ) + + def set_cache_hook(self, cache_hook) -> None: + self.cache_hook = cache_hook + + +### SQLite-based caching of LM responses +def hash_args(attr, args): + dat = json.dumps([attr] + list(args)) + return hashlib.sha256(dat.encode("utf-8")).hexdigest() + + +class CacheHook: + def __init__(self, cachinglm) -> None: + if cachinglm is None: + self.dbdict = None + return + + self.dbdict = cachinglm.dbdict + + def add_partial(self, attr, req, res) -> None: + if self.dbdict is None: + return + hsh = hash_args(attr, req) + self.dbdict[hsh] = res + + +class CachingLM: + def __init__(self, lm, cache_db) -> None: + """LM wrapper that returns cached results if they exist, and uses the underlying LM if not. + + :param lm: LM + Underlying LM + :param cache_db: str + Path to cache db + """ + self.lm = lm + self.cache_db = cache_db + if os.path.dirname(cache_db): + os.makedirs(os.path.dirname(cache_db), exist_ok=True) + self.dbdict = SqliteDict(cache_db, autocommit=True) + + # add hook to lm + lm.set_cache_hook(self.get_cache_hook()) + + def __getattr__(self, attr: str): + lm_attr = getattr(self.lm, attr) + if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]: + eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM") + return lm_attr + + def fn(requests): + res = [] + remaining_reqs = [] + warned = False + # figure out which ones are cached and which ones are new + eval_logger.info( + f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..." + ) + for req in tqdm(requests, desc="Checking cached requests"): + hsh = hash_args(attr, req.args) + if attr == "generate_until" and req.args[1].get("do_sample", False): + # when we are doing non-greedy generation, don't use the cache + # (else every "randomly sampled" generation would be identical for repeats > 1). + if not warned: + eval_logger.warning( + f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." 
+ ) + warned = True + res.append(None) + remaining_reqs.append(req) + elif hsh in self.dbdict: + ob = self.dbdict[hsh] + + assert ob is not None + + res.append(ob) + else: + res.append(None) + remaining_reqs.append(req) + eval_logger.info( + f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}" + ) + # actually run the LM on the requests that do not have cached results + rem_res = getattr(self.lm, attr)(remaining_reqs) + + # stick the new ones back into the list and also cache any of the new ones + resptr = 0 + for req, r in zip(remaining_reqs, rem_res): + while res[resptr] is not None: + resptr += 1 + + res[resptr] = r + + # caching + hsh = hash_args(attr, req.args) + self.dbdict[hsh] = r + self.dbdict.commit() + + return res + + return fn + + def get_cache_hook(self): + return CacheHook(self) + + +class TemplateLM(LM): + """ + A class acting as intermediary between the LM base class + and boilerplate often included in other LM subclasses. + """ + + @property + @abc.abstractmethod + def eot_token_id(self): + pass + + @property + def prefix_token_id(self): + # it is used as prefix for loglikelihood + return self.eot_token_id + + @abc.abstractmethod + def tok_encode(self, string: str, **kwargs) -> List[int]: + """ + Tokenize a string using the model's tokenizer and return a list of token IDs. + """ + pass + + @abc.abstractmethod + def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: + pass + + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + + model_class = getattr(self, "AUTO_MODEL_CLASS", None) + + if model_class == transformers.AutoModelForSeq2SeqLM: + context_enc = self.tok_encode(context) + continuation_enc = self.tok_encode(continuation, add_special_tokens=False) + else: + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + + return context_enc, continuation_enc + + def loglikelihood( + self, requests, disable_tqdm: bool = False + ) -> List[Tuple[float, bool]]: + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + # BOS or EOS as context + context_enc, continuation_enc = ( + [self.prefix_token_id], + self.tok_encode(continuation), + ) + else: + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm) + + @abc.abstractmethod + def loglikelihood_rolling( + self, requests, disable_tqdm: bool = False + ) -> List[float]: + pass + + @abc.abstractmethod + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + pass diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..7446a429e61d9b287c384b5be5db2a258ea83ae8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py @@ -0,0 +1,192 @@ +import logging +from typing import Callable, Dict + +import evaluate as hf_evaluate + +from lm_eval.api.model import LM + + +eval_logger = logging.getLogger("lm-eval") + +MODEL_REGISTRY = {} + + +def 
register_model(*names): + # either pass a list or a single alias. + # function receives them as a tuple of strings + + def decorate(cls): + for name in names: + assert issubclass( + cls, LM + ), f"Model '{name}' ({cls.__name__}) must extend LM class" + + assert ( + name not in MODEL_REGISTRY + ), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead." + + MODEL_REGISTRY[name] = cls + return cls + + return decorate + + +def get_model(model_name): + try: + return MODEL_REGISTRY[model_name] + except KeyError: + raise ValueError( + f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}" + ) + + +TASK_REGISTRY = {} +GROUP_REGISTRY = {} +ALL_TASKS = set() +func2task_index = {} + + +def register_task(name): + def decorate(fn): + assert ( + name not in TASK_REGISTRY + ), f"task named '{name}' conflicts with existing registered task!" + + TASK_REGISTRY[name] = fn + ALL_TASKS.add(name) + func2task_index[fn.__name__] = name + return fn + + return decorate + + +def register_group(name): + def decorate(fn): + func_name = func2task_index[fn.__name__] + if name in GROUP_REGISTRY: + GROUP_REGISTRY[name].append(func_name) + else: + GROUP_REGISTRY[name] = [func_name] + ALL_TASKS.add(name) + return fn + + return decorate + + +OUTPUT_TYPE_REGISTRY = {} +METRIC_REGISTRY = {} +METRIC_AGGREGATION_REGISTRY = {} +AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {} +HIGHER_IS_BETTER_REGISTRY = {} +FILTER_REGISTRY = {} + +DEFAULT_METRIC_REGISTRY = { + "loglikelihood": [ + "perplexity", + "acc", + ], + "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"], + "multiple_choice": ["acc", "acc_norm"], + "generate_until": ["exact_match"], +} + + +def register_metric(**args): + # TODO: do we want to enforce a certain interface to registered metrics? + def decorate(fn): + assert "metric" in args + name = args["metric"] + + for key, registry in [ + ("metric", METRIC_REGISTRY), + ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), + ("aggregation", METRIC_AGGREGATION_REGISTRY), + ]: + if key in args: + value = args[key] + assert ( + value not in registry + ), f"{key} named '{value}' conflicts with existing registered {key}!" + + if key == "metric": + registry[name] = fn + elif key == "aggregation": + registry[name] = AGGREGATION_REGISTRY[value] + else: + registry[name] = value + + return fn + + return decorate + + +def get_metric(name: str, hf_evaluate_metric=False) -> Callable: + if not hf_evaluate_metric: + if name in METRIC_REGISTRY: + return METRIC_REGISTRY[name] + else: + eval_logger.warning( + f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..." + ) + + try: + metric_object = hf_evaluate.load(name) + return metric_object.compute + except Exception: + eval_logger.error( + f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric", + ) + + +def register_aggregation(name: str): + def decorate(fn): + assert ( + name not in AGGREGATION_REGISTRY + ), f"aggregation named '{name}' conflicts with existing registered aggregation!" 
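+        # Illustrative usage of this decorator (a sketch of how aggregations are
+        # typically registered elsewhere in the codebase, e.g. in lm_eval.api.metrics):
+        #
+        #     @register_aggregation("mean")
+        #     def mean(arr):
+        #         return sum(arr) / len(arr)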
+ + AGGREGATION_REGISTRY[name] = fn + return fn + + return decorate + + +def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]: + try: + return AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning(f"{name} not a registered aggregation metric!") + + +def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]: + try: + return METRIC_AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning(f"{name} metric is not assigned a default aggregation!") + + +def is_higher_better(metric_name) -> bool: + try: + return HIGHER_IS_BETTER_REGISTRY[metric_name] + except KeyError: + eval_logger.warning( + f"higher_is_better not specified for metric '{metric_name}'!" + ) + + +def register_filter(name): + def decorate(cls): + if name in FILTER_REGISTRY: + eval_logger.info( + f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}" + ) + FILTER_REGISTRY[name] = cls + return cls + + return decorate + + +def get_filter(filter_name: str) -> type: + try: + return FILTER_REGISTRY[filter_name] + except KeyError: + eval_logger.warning(f"filter `{filter_name}` is not registered!") diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..94e101729c8eb48dc10066a0114f3ba1f60a1307 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py @@ -0,0 +1,198 @@ +from functools import partial + +import datasets + + +class ContextSampler: + def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: + self.rnd = rnd + if not self.rnd: + raise ValueError( + "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!" + ) + + self.task = task + self.config = task._config + + self.target_delimiter = self.config.target_delimiter + self.fewshot_delimiter = self.config.fewshot_delimiter + + if ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("doc_to_text", None) is not None + ): + self.doc_to_text = partial( + self.task.doc_to_text, + doc_to_text=self.config.fewshot_config.get("doc_to_text", None), + ) + else: + self.doc_to_text = self.task.doc_to_text + + if ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("doc_to_target", None) is not None + ): + self.doc_to_target = partial( + self.task.doc_to_target, + doc_to_target=self.config.fewshot_config.get("doc_to_target", None), + ) + else: + self.doc_to_target = self.task.doc_to_target + + if ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("doc_to_choice", None) is not None + ): + self.doc_to_choice = partial( + self.task.doc_to_choice, + doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None), + ) + else: + self.doc_to_choice = self.task.doc_to_choice + + self.docs = docs # HF dataset split, provided by task._fewshot_docs() + if fewshot_indices: # subset few-shot docs from + if not isinstance(self.docs, datasets.Dataset): + raise ValueError( + "Got `fewshot_indices` but fewshot_docs are not a HF dataset. 
Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously" + ) + self.docs = self.docs.select(fewshot_indices) + + def get_context(self, doc, num_fewshot): + # draw an extra fewshot sample if using same split as evaluating on + n_samples = ( + num_fewshot + 1 + if self.config.fewshot_split == self.config.test_split + else num_fewshot + ) + + # draw `n_samples` docs from fewshot_docs + fewshotex = self.sample(n_samples) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + # TODO: should we just stop people from using fewshot from same split as evaluating? + selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = "" + for doc in selected_docs: + doc_content = self.doc_to_text(doc) + doc_target = self.doc_to_target(doc) + labeled_examples += ( + doc_content + if self.config.doc_to_choice is None or isinstance(doc_content, str) + else self.doc_to_choice(doc)[doc_content] + ) + labeled_examples += self.target_delimiter + if doc_target != "": + labeled_examples += ( + str(doc_target[0]) + if isinstance(doc_target, list) + else doc_target + if self.config.doc_to_choice is None or isinstance(doc_target, str) + else str(self.doc_to_choice(doc)[doc_target]) + ) + labeled_examples += self.fewshot_delimiter + + return labeled_examples + + def get_chat_context( + self, + doc, + num_fewshot, + fewshot_as_multiturn: bool = False, + ): + chat_history = [] + # draw an extra fewshot sample if using same split as evaluating on + n_samples = ( + num_fewshot + 1 + if self.config.fewshot_split == self.config.test_split + else num_fewshot + ) + # draw `n_samples` docs from fewshot_docs + fewshotex = self.sample(n_samples) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + # TODO: should we just stop people from using fewshot from same split as evaluating? + selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] + + if fewshot_as_multiturn: + for doc in selected_docs: + doc_content = self.doc_to_text(doc) + doc_target = self.doc_to_target(doc) + chat_history.append( + { + "role": "user", + "content": doc_content + if self.config.doc_to_choice is None + or isinstance(doc_content, str) + else self.doc_to_choice(doc)[doc_content], + } + ) + chat_history.append( + { + "role": "assistant", + "content": str(doc_target[0]) + if isinstance(doc_target, list) + else doc_target + if self.config.doc_to_choice is None + or isinstance(doc_target, str) + else str(self.doc_to_choice(doc)[doc_target]), + } + ) + else: + # get fewshot context as one user turn + chat_history.append( + {"role": "user", "content": self.get_context(doc, num_fewshot)} + ) + + return chat_history + + def sample(self, n): + """ + Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. + """ + + return self.rnd.sample(self.docs, n) + + +class FirstNSampler(ContextSampler): + def sample(self, n) -> None: + """ + Draw the first `n` samples in order from the specified split. + Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. + """ + assert ( + n <= len(self.docs) + ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." + return self.docs[:n] + + +class BalancedSampler(ContextSampler): + def sample(self, n) -> None: + """ + TODO: this should return approximately class-balanced samples from our fewshot examples. + TODO: what order should they be in? maybe random? 
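+        One possible approach (an illustrative sketch only; nothing below is
+        implemented here): bucket `self.docs` by their `self.doc_to_target(doc)`
+        label, shuffle each bucket with `self.rnd`, then round-robin across the
+        buckets until `n` docs have been drawn, so every label is represented
+        roughly equally, e.g.
+
+            buckets = collections.defaultdict(list)
+            for d in self.docs:
+                buckets[str(self.doc_to_target(d))].append(d)
+            # then interleave the shuffled buckets and take the first `n` docs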
+ """ + + pass + + +class ManualSampler(ContextSampler): + def sample(self, n) -> None: + """ """ + pass + + +SAMPLER_REGISTRY = { + "default": ContextSampler, + "first_n": FirstNSampler, +} + + +def get_sampler(name): + try: + return SAMPLER_REGISTRY[name] + except KeyError: + raise ValueError( + f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/task.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/task.py new file mode 100644 index 0000000000000000000000000000000000000000..8a1a3bdbafac2d1c4c2cc7764a1e988e92183c53 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/task.py @@ -0,0 +1,1674 @@ +import abc +import ast +import logging +import random +import re +from collections.abc import Callable +from copy import deepcopy +from dataclasses import asdict, dataclass +from inspect import getsource +from typing import ( + Any, + Dict, + Iterable, + Iterator, + List, + Literal, + Mapping, + Optional, + Tuple, + Union, +) + +import datasets +import numpy as np +from tqdm import tqdm + +from lm_eval import utils +from lm_eval.api import samplers +from lm_eval.api.instance import Instance, OutputType +from lm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity +from lm_eval.api.registry import ( + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, + get_aggregation, + get_metric, + get_metric_aggregation, + is_higher_better, +) +from lm_eval.caching.cache import load_from_cache, save_to_cache +from lm_eval.filters import build_filter_ensemble +from lm_eval.prompts import get_prompt + + +ALL_OUTPUT_TYPES = [ + "loglikelihood", + "multiple_choice", + "loglikelihood_rolling", + "generate_until", +] + +eval_logger = logging.getLogger("lm-eval") + + +@dataclass +class TaskConfig(dict): + # task naming/registry + task: Optional[str] = None + task_alias: Optional[str] = None + tag: Optional[Union[str, list]] = None + group: Optional[Union[str, list]] = None + # HF dataset options. + # which dataset to use, + # and what splits for what purpose + dataset_path: Optional[str] = None + dataset_name: Optional[str] = None + dataset_kwargs: Optional[dict] = None + training_split: Optional[str] = None + validation_split: Optional[str] = None + test_split: Optional[str] = None + fewshot_split: Optional[str] = ( + None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?) + ) + # formatting / prompting options. + # see docs/advanced_task_guide.md for more info + process_docs: Optional[Callable] = None + doc_to_text: Optional[Union[Callable, str]] = None + doc_to_target: Optional[Union[Callable, str]] = None + doc_to_choice: Optional[Union[Callable, str, dict, list]] = None + process_results: Optional[Union[Callable, str]] = None + use_prompt: Optional[str] = None + description: str = "" + target_delimiter: str = " " + fewshot_delimiter: str = "\n\n" + fewshot_config: Optional[dict] = None + # runtime configuration options + num_fewshot: Optional[int] = None + # scoring options + metric_list: Optional[list] = None + output_type: OutputType = "generate_until" + generation_kwargs: Optional[dict] = None + repeats: int = 1 + filter_list: Optional[Union[str, list]] = None + should_decontaminate: bool = False + doc_to_decontamination_query: Optional[str] = None + metadata: Optional[dict] = ( + None # by default, not used in the code. 
allows for users to pass arbitrary info to tasks + ) + + def __post_init__(self) -> None: + if self.group is not None: + eval_logger.warning( + "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information." + ) + + if self.tag is None: + self.tag = self.group + else: + raise ValueError( + "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4." + ) + + if self.generation_kwargs is not None: + if self.output_type != "generate_until": + eval_logger.warning( + f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!" + ) + + if "temperature" in self.generation_kwargs: + self.generation_kwargs["temperature"] = float( + self.generation_kwargs["temperature"] + ) + + if "until" not in self.generation_kwargs: + self.generation_kwargs["until"] = [self.fewshot_delimiter] + else: + if self.output_type == "generate_until": + # ensure that we greedily generate in absence of explicit arguments otherwise + self.generation_kwargs = { + "until": ( + None + if self.fewshot_delimiter is None + else [self.fewshot_delimiter] + ), + "do_sample": False, + } + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + return setattr(self, item, value) + + def to_dict(self, keep_callable: bool = False) -> dict: + """dumps the current config as a dictionary object, as a printable format. + null fields will not be printed. + Used for dumping results alongside full task configuration + + :return: dict + A printable dictionary version of the TaskConfig object. + + # TODO: should any default value in the TaskConfig not be printed? + """ + cfg_dict = asdict(self) + # remove values that are `None` + for k, v in list(cfg_dict.items()): + if v is None: + cfg_dict.pop(k) + elif k == "metric_list": + for metric_dict in v: + for metric_key, metric_value in metric_dict.items(): + if callable(metric_value): + metric_dict[metric_key] = self.serialize_function( + metric_value, keep_callable=keep_callable + ) + cfg_dict[k] = v + elif callable(v): + cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) + return cfg_dict + + def serialize_function( + self, value: Union[Callable, str], keep_callable=False + ) -> Union[Callable, str]: + """Serializes a given function or string. + + If 'keep_callable' is True, the original callable is returned. + Otherwise, attempts to return the source code of the callable using 'getsource'. + """ + if keep_callable: + return value + else: + try: + return getsource(value) + except (TypeError, OSError): + return str(value) + + +class Task(abc.ABC): + """A task represents an entire benchmark including its dataset, problems, + answers, and evaluation methods. See BoolQ for a simple example implementation + + A `doc` can be any python object which represents one instance of evaluation. + This is usually a dictionary e.g. + {"question": ..., "answer": ...} or + {"question": ..., question, answer) + """ + + VERSION: Optional[Union[int, str]] = None + + # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub + # or a path to a custom `datasets` loading script. 
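+    # (Illustrative values only: e.g. a task wrapping BoolQ might set
+    # DATASET_PATH = "super_glue" and DATASET_NAME = "boolq".)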
+ DATASET_PATH: Optional[str] = None + + # The name of a subset within `DATASET_PATH`. + DATASET_NAME: Optional[str] = None + + OUTPUT_TYPE: Optional[OutputType] = None + + def __init__( + self, + data_dir: Optional[str] = None, + cache_dir: Optional[str] = None, + download_mode: Optional[datasets.DownloadMode] = None, + config: Optional[Mapping] = None, # Union[dict, TaskConfig] + ) -> None: + """ + :param data_dir: str + Stores the path to a local folder containing the `Task`'s data files. + Use this to specify the path to manually downloaded data (usually when + the dataset is not publicly accessible). + :param cache_dir: str + The directory to read/write the `Task` dataset. This follows the + HuggingFace `datasets` API with the default cache directory located at: + `~/.cache/huggingface/datasets` + NOTE: You can change the cache location globally for a given process + to another directory: + `export HF_DATASETS_CACHE="/path/to/another/directory"` + :param download_mode: datasets.DownloadMode + How to treat pre-existing `Task` downloads and data. + - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` + Reuse download and reuse dataset. + - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` + Reuse download with fresh dataset. + - `datasets.DownloadMode.FORCE_REDOWNLOAD` + Fresh download and fresh dataset. + """ + self.download(data_dir, cache_dir, download_mode) + self._training_docs: Optional[list] = None + self._fewshot_docs: Optional[list] = None + self._instances: Optional[List[Instance]] = None + + self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig() + + self._filters = [build_filter_ensemble("none", [["take_first", None]])] + self.fewshot_rnd: Optional[random.Random] = ( + None # purposely induce errors in case of improper usage + ) + + def download( + self, + data_dir: Optional[str] = None, + cache_dir: Optional[str] = None, + download_mode=None, + ) -> None: + """Downloads and returns the task dataset. + Override this method to download the dataset from a custom API. + + :param data_dir: str + Stores the path to a local folder containing the `Task`'s data files. + Use this to specify the path to manually downloaded data (usually when + the dataset is not publicly accessible). + :param cache_dir: str + The directory to read/write the `Task` dataset. This follows the + HuggingFace `datasets` API with the default cache directory located at: + `~/.cache/huggingface/datasets` + NOTE: You can change the cache location globally for a given process + by setting the shell environment variable, `HF_DATASETS_CACHE`, + to another directory: + `export HF_DATASETS_CACHE="/path/to/another/directory"` + :param download_mode: datasets.DownloadMode + How to treat pre-existing `Task` downloads and data. + - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` + Reuse download and reuse dataset. + - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` + Reuse download with fresh dataset. + - `datasets.DownloadMode.FORCE_REDOWNLOAD` + Fresh download and fresh dataset. 
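+        Example (an illustrative sketch, not part of the upstream docstring): a
+        subclass reading local JSONL files instead of the Hub could override this as
+
+            def download(self, *args, **kwargs) -> None:
+                self.dataset = datasets.load_dataset(
+                    "json", data_files={"test": "/path/to/test.jsonl"}
+                )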
+ """ + self.dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + data_dir=data_dir, + cache_dir=cache_dir, + download_mode=download_mode, + ) + + @property + def config(self) -> TaskConfig: + """Returns the TaskConfig associated with this class.""" + return self._config + + @abc.abstractmethod + def has_training_docs(self): + """Whether the task has a training set""" + pass + + @abc.abstractmethod + def has_validation_docs(self): + """Whether the task has a validation set""" + pass + + @abc.abstractmethod + def has_test_docs(self): + """Whether the task has a test set""" + pass + + def training_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def validation_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def test_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def fewshot_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + if self.has_training_docs(): + return self.training_docs() + elif self.has_validation_docs(): + return self.validation_docs() + else: + eval_logger.warning( + f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False" + ", using test_docs as fewshot_docs but this is not recommended." + ) + return self.test_docs() + + def _process_doc(self, doc: dict) -> dict: + """ + Override this to process (detokenize, strip, replace, etc.) individual + documents. This can be used in a map over documents of a data split. + E.g. `map(self._process_doc, self.dataset["validation"])` + + :return: dict + The processed version of the specified `doc`. + """ + return doc + + @property + def instances(self) -> List[Instance]: + """After calling `task.build_all_requests()`, tasks + maintain a list of the dataset instances which will be evaluated. + """ + return self._instances + + def fewshot_examples(self, k, rnd): + if self._training_docs is None: + self._training_docs = list(self.training_docs()) + + return rnd.sample(self._training_docs, k) + + def doc_to_decontamination_query(self, doc): + raise NotImplementedError( + "Override doc_to_decontamination_query with document specific decontamination query." 
+ ) + + @abc.abstractmethod + def doc_to_text(self, doc): + pass + + @abc.abstractmethod + def doc_to_target(self, doc): + pass + + def build_all_requests( + self, + *, + limit: Union[int, None] = None, + rank: int = 0, + world_size: int = 1, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + tokenizer_name: str = "", + ) -> None: + """Build a set of Instances for a task, and store them in task.instances""" + + # used with caching + og_limit = limit + + cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}" + cache_key += "-chat_template" if apply_chat_template else "" + cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else "" + cache_key += ( + f"-system_prompt_hash{utils.hash_string(system_instruction)}" + if system_instruction is not None + else "" + ) + cache_key += f"-tokenizer{tokenizer_name}" + + cached_instances = load_from_cache(file_name=cache_key) + + if cache_requests and cached_instances and not rewrite_requests_cache: + cached_instances = cached_instances[:limit] + + flattened_instances = [ + instance + for instance_group in cached_instances + for instance in instance_group + ] + + self._instances = flattened_instances + return + + eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...") + + instances = [] + + # process all documents when caching is specified for simplicity + if ( + cache_requests + and (not cached_instances or rewrite_requests_cache) + and limit is not None + ): + limit = None + + doc_id_docs = list( + self.doc_iterator(rank=rank, limit=limit, world_size=world_size) + ) + + num_docs = len(doc_id_docs) + + for doc_id, doc in tqdm( + doc_id_docs, + total=num_docs, + ): + # sample fewshot context #TODO: need to offset doc_id by rank now! + fewshot_ctx = self.fewshot_context( + doc, + 0 if self.config.num_fewshot is None else self.config.num_fewshot, + system_instruction, + apply_chat_template, + fewshot_as_multiturn, + chat_template, + ) + + # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute + inst = self.construct_requests( + doc=doc, + ctx=fewshot_ctx, + metadata=(self.config["task"], doc_id, self.config.repeats), + ) + + if not isinstance(inst, list): + inst = [inst] + + instances.append(inst) + + # now flatten, this is to allow slicing to work with pickles + + sliced_instances = instances[:og_limit] + + flattened_instances = [ + instance + for instance_group in sliced_instances + for instance in instance_group + ] + + self._instances = flattened_instances + + if len(self._instances) == 0: + raise ValueError("task.build_requests() did not find any docs!") + + if cache_requests and (not cached_instances or rewrite_requests_cache): + save_to_cache(file_name=cache_key, obj=instances) + + @abc.abstractmethod + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. 
+ :param doc_idx: int + The index of a document within `self.test_docs()` or `self.validation_docs()`, + whichever is the main split used. + :param repeats: int + TODO: update this docstring + The number of times each instance in a dataset is inferred on. Defaults to 1, + can be increased for techniques like majority voting. + """ + pass + + @abc.abstractmethod + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + pass + + @abc.abstractmethod + def aggregation(self): + """ + :returns: {str: [metric_score] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metric scores + """ + pass + + @abc.abstractmethod + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + pass + + def get_config(self, key: str) -> Any: + return getattr(self._config, key, None) + + @classmethod + def count_bytes(cls, doc): + """Used for byte-level perplexity metrics in rolling loglikelihood""" + return len(doc.encode("utf-8")) + + @classmethod + def count_words(cls, doc): + """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!""" + return len(re.split(r"\s+", doc)) + + @utils.positional_deprecated + def fewshot_context( + self, + doc, + num_fewshot, + rnd=None, + description=None, + ): + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc: str + The document as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :param rnd: random.Random + The pseudo-random number generator used to randomly sample examples. + WARNING: This is currently a required arg although it's optionalized with a default `None`. + :param description: str + The task's description that will be prepended to the fewshot examples. + :returns: str + The fewshot context. 
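+
+        Example (illustrative sketch; assumes `task` is a concrete Task subclass
+        instance and `doc` is one of its evaluation documents):
+
+            import random
+            ctx = task.fewshot_context(doc, num_fewshot=2, rnd=random.Random(1234))
+            # `ctx` is the plain-text prompt: description + 2 examples + the query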
+ """ + if rnd is None: + if self.fewshot_rnd is not None: + rnd = self.fewshot_rnd + else: + raise ValueError( + "A `random.Random` generator argument must be provided to `rnd`" + ) + + description = description if description else "" + + if num_fewshot == 0: + labeled_examples = "" + else: + # for sets with no training docs, draw from other set *but ensure no overlap with current doc* + if self.has_training_docs(): + fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd) + else: + if self._fewshot_docs is None: + self._fewshot_docs = list( + self.validation_docs() + if self.has_validation_docs() + else self.test_docs() + ) + + fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + fewshotex = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = ( + "\n\n".join( + [ + self.doc_to_text(doc) + self.doc_to_target(doc) + for doc in fewshotex + ] + ) + + "\n\n" + ) + + example = self.doc_to_text(doc) + return description + labeled_examples + example + + def apply_filters(self) -> Optional[List[Instance]]: + """Iterates over FilterEnsembles and applies them to instances""" + if hasattr(self, "_filters"): + for f in self._filters: + f.apply(self._instances) + else: + eval_logger.warning("No filter defined, passing through instances") + return self._instances + + def dump_config(self) -> dict: + """Returns the config as a dictionary.""" + # TODO: this should only return the overrides applied to a non-YAML task's configuration. + # (num_fewshot) + return self.config.to_dict() + + def set_config(self, key: str, value: Any, update: bool = False) -> None: + """Set or update the configuration for a given key.""" + if key is None: + raise ValueError("Key must be provided.") + + if update: + current_value = getattr(self._config, key, {}) + if not isinstance(current_value, dict): + raise TypeError( + f"Expected a dict for key '{key}', got {type(current_value).__name__} instead." + ) + current_value.update(value) + else: + setattr(self._config, key, value) + + def override_metric(self, metric_name: str) -> None: + """ + Override the default metrics used for evaluation with custom metrics. + + Parameters: + - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics. + """ + ( + self._metric_fn_list, + self._aggregation_list, + self._metric_fn_kwargs, + self._higher_is_better, + ) = ({}, {}, {}, {}) + self._metric_fn_list[metric_name] = get_metric(metric_name) + self._aggregation_list[metric_name] = get_metric_aggregation(metric_name) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + self._metric_fn_kwargs[metric_name] = {} + if not isinstance(self, ConfigurableTask): + self.process_results = lambda x, y: {metric_name: get_metric(metric_name)} + self.aggregation = lambda: { + metric_name: get_metric_aggregation(metric_name) + } + setattr(self._config, "metric_list", [{"metric": metric_name}]) + setattr(self._config, "process_results", None) + + def set_fewshot_seed(self, seed: Optional[int] = None) -> None: + self.fewshot_rnd = random.Random(seed) + if hasattr(self, "sampler"): + self.sampler.rnd = self.fewshot_rnd + + @property + def eval_docs(self) -> Union[datasets.Dataset, List[dict]]: + if self.has_test_docs(): + return self.test_docs() + elif self.has_validation_docs(): + return self.validation_docs() + else: + raise ValueError( + f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" 
+ ) + + def doc_iterator( + self, *, rank: int = 0, limit: Union[int, None] = None, world_size: int = 1 + ) -> Iterator[Tuple[int, Any]]: + limit = int(limit) if limit else None + doc_iterator = utils.create_iterator( + enumerate(self.eval_docs), + rank=int(rank), + limit=limit, + world_size=int(world_size), + ) + return doc_iterator + + +class ConfigurableTask(Task): + VERSION = "Yaml" + OUTPUT_TYPE = None + CONFIG = None + + def __init__( + self, + data_dir=None, + cache_dir=None, + download_mode=None, + config: Optional[dict] = None, + ) -> None: # TODO no super() call here + # Get pre-configured attributes + self._config = self.CONFIG + + # Use new configurations if there was no preconfiguration + if self.config is None: + self._config = TaskConfig(**config) + # Overwrite configs + else: + if config is not None: + self._config.__dict__.update(config) + + if self.config is None: + raise ValueError( + "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg" + ) + + if isinstance(self.config.metadata, dict): + if "version" in self.config.metadata: + self.VERSION = self.config.metadata["version"] + + if self.config.output_type is not None: + if self.config.output_type not in ALL_OUTPUT_TYPES: + raise ValueError( + f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'" + ) + self.OUTPUT_TYPE = self.config.output_type + + if self.config.dataset_path is not None: + self.DATASET_PATH = self.config.dataset_path + + if self.config.dataset_name is not None: + self.DATASET_NAME = self.config.dataset_name + + self._metric_fn_list = {} + self._metric_fn_kwargs = {} + self._aggregation_list = {} + self._higher_is_better = {} + + if self.config.metric_list is None: + # TODO: handle this in TaskConfig.__post_init__ ? + _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] + + for metric_name in _metric_list: + self._metric_fn_list[metric_name] = get_metric(metric_name) + self._metric_fn_kwargs[metric_name] = {} + self._aggregation_list[metric_name] = get_metric_aggregation( + metric_name + ) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + else: + for metric_config in self.config.metric_list: + if "metric" not in metric_config: + raise ValueError( + "'metric' key not provided for an entry in 'metric_list', must be specified!" 
+ ) + metric_name = metric_config["metric"] + kwargs = { + key: metric_config[key] + for key in metric_config + if key + not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] + } + hf_evaluate_metric = ( + "hf_evaluate" in metric_config + and metric_config["hf_evaluate"] is True + ) + + if self.config.process_results is not None: + self._metric_fn_list[metric_name] = None + self._metric_fn_kwargs[metric_name] = {} + elif callable(metric_name): + metric_fn = metric_name.__call__ + metric_name = metric_name.__name__ + self._metric_fn_list[metric_name] = metric_fn + self._metric_fn_kwargs[metric_name] = kwargs + else: + self._metric_fn_list[metric_name] = get_metric( + metric_name, hf_evaluate_metric + ) + self._metric_fn_kwargs[metric_name] = kwargs + + if "aggregation" in metric_config: + agg_name = metric_config["aggregation"] + if isinstance(agg_name, str): + self._aggregation_list[metric_name] = get_aggregation(agg_name) + elif callable(agg_name): # noqa: E721 + self._aggregation_list[metric_name] = metric_config[ + "aggregation" + ] + else: + INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} + metric_agg = get_metric_aggregation(metric_name) + eval_logger.warning( + f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. " + f"using default " + f"aggregation={INV_AGG_REGISTRY[metric_agg]}" + ) + self._aggregation_list[metric_name] = metric_agg + + if "higher_is_better" in metric_config: + self._higher_is_better[metric_name] = metric_config[ + "higher_is_better" + ] + else: + eval_logger.warning( + f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. " + f"using default " + f"higher_is_better={is_higher_better(metric_name)}" + ) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + + self.download(self.config.dataset_kwargs) + self._training_docs = None + self._fewshot_docs = None + + if self.config.filter_list is not None: + self._filters = [] + for filter_config in self.config.filter_list: + filter_name = filter_config["name"] + filter_functions = filter_config["filter"] + components = [] + for function in filter_functions: + kwargs = { + key: function[key] for key in function if key != "function" + } + components.append([function["function"], kwargs]) + filter_pipeline = build_filter_ensemble(filter_name, components) + self._filters.append(filter_pipeline) + else: + self._filters = [build_filter_ensemble("none", [["take_first", None]])] + + if self.config.use_prompt is not None: + eval_logger.info(f"loading prompt {self.config.use_prompt}") + self.prompt = get_prompt( + self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME + ) + else: + self.prompt = None + + if self.fewshot_docs() is not None: + self.fewshot_rnd = ( + random.Random() + ) # setting with no seed, to be overridden at a later time + config_sampler: Union[str, Callable] = ( + self.config.fewshot_config.get("sampler", "default") + if self.config.fewshot_config + else "default" + ) + if isinstance(config_sampler, str): + self.sampler = samplers.get_sampler(config_sampler)( + list(self.fewshot_docs()), self, rnd=self.fewshot_rnd + ) + elif callable(config_sampler) and issubclass( + config_sampler, samplers.ContextSampler + ): + self.sampler = config_sampler( + docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd + ) + else: + raise TypeError( + f"fewshot_config.sampler should be a string or callable of ContextSampler type, " + f"not {type(config_sampler)}" + ) + + self.task_docs = 
self.eval_docs + + # Test One Doc + self.features = list(self.task_docs.features.keys()) + self.multiple_input = 0 + self.multiple_target = 0 + test_doc = self.task_docs[0] + test_text = self.doc_to_text(test_doc) + test_target = self.doc_to_target(test_doc) + + if self.config.doc_to_choice is not None: + test_choice = self.doc_to_choice(test_doc) + if not isinstance(test_choice, list): + eval_logger.error("doc_to_choice must return list") + else: + num_choice = len(test_choice) + + if isinstance(test_text, int): + self.multiple_input = num_choice + else: + test_choice = None + + if isinstance(test_target, list): + self.multiple_target = len(test_target) + else: + if (isinstance(test_target, int)) and (test_choice is not None): + test_target = test_choice[test_target] + else: + test_target = str(test_target) + + if test_choice is not None: + check_choices = test_choice + else: + check_choices = [test_target] + if self.config.doc_to_choice is not None: + for choice in check_choices: + choice_has_whitespace = True if choice[0].isspace() else False + delimiter_has_whitespace = ( + True + if self.config.target_delimiter.rstrip() + != self.config.target_delimiter + else False + ) + + if delimiter_has_whitespace and choice_has_whitespace: + eval_logger.debug( + f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace' + ) + elif (not delimiter_has_whitespace) and (not choice_has_whitespace): + eval_logger.debug( + f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' + ) + + def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None: + self.dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + **dataset_kwargs if dataset_kwargs is not None else {}, + ) + + def has_training_docs(self) -> bool: + if self.config.training_split is not None: + return True + else: + return False + + def has_validation_docs(self) -> bool: + if self.config.validation_split is not None: + return True + else: + return False + + def has_test_docs(self) -> bool: + if self.config.test_split is not None: + return True + else: + return False + + def training_docs(self) -> datasets.Dataset: + if self.has_training_docs(): + if self.config.process_docs is not None: + return self.config.process_docs( + self.dataset[self.config.training_split] + ) + return self.dataset[self.config.training_split] + + def validation_docs(self) -> datasets.Dataset: + if self.has_validation_docs(): + if self.config.process_docs is not None: + return self.config.process_docs( + self.dataset[self.config.validation_split] + ) + return self.dataset[self.config.validation_split] + + def test_docs(self) -> datasets.Dataset: + if self.has_test_docs(): + if self.config.process_docs is not None: + return self.config.process_docs(self.dataset[self.config.test_split]) + return self.dataset[self.config.test_split] + + def fewshot_docs(self): + if self.config.fewshot_split is not None: + if self.config.process_docs is not None: + return self.config.process_docs(self.dataset[self.config.fewshot_split]) + return self.dataset[self.config.fewshot_split] + elif ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("samples", None) is not None + ): + if isinstance(self.config.fewshot_config["samples"], list): + return self.config.fewshot_config["samples"] + elif callable(self.config.fewshot_config["samples"]): + 
return self.config.fewshot_config["samples"]() + else: + raise Exception( + "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list." + ) + else: + if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): + eval_logger.warning( + f"[Task: {self.config.task}] " + "num_fewshot > 0 but fewshot_split is None. " + "using preconfigured rule." + ) + return super().fewshot_docs() + + @staticmethod + def append_target_question( + labeled_examples: List[Dict[str, str]], + question: str, + fewshot_as_multiturn: bool = False, + ) -> None: + """Adds a target question to the labeled examples list. + If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry. + Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant. + """ + if not fewshot_as_multiturn: + # if no messages or last message is system, append as new user entry + if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system": + labeled_examples.append({"role": "user", "content": question}) + # if last message is user, append to it to avoid two user messages in a row + else: + labeled_examples[-1]["content"] += question + else: + # if fewshot_as_multiturn is True, append as next user entry (last is always assistant) + labeled_examples.append({"role": "user", "content": question}) + + @utils.positional_deprecated + def fewshot_context( + self, + doc: str, + num_fewshot: int, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + ) -> str: + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc: str + The document as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :param system_instruction: str + System instruction to be applied to the prompt. + :param apply_chat_template: bool + Whether to apply the chat template to the fewshot context. + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :param chat_template: Callable + Chat template to be applied to the fewshot context. + :returns: str + The fewshot context. 
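+
+        Example (illustrative sketch; assumes `task` is a ConfigurableTask whose
+        fewshot sampler was seeded via `task.set_fewshot_seed(1234)` and `doc` is
+        one of its evaluation documents):
+
+            ctx = task.fewshot_context(doc, num_fewshot=2)
+            # with apply_chat_template=True the same call instead builds a list of
+            # chat messages and renders it via the supplied `chat_template` callable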
+ """ + + if apply_chat_template: + labeled_examples = [] + else: + labeled_examples = "" + + # get task description + if description := self.config.description: + description = utils.apply_template(self.config.description, doc) + + # create system prompt based on the provided system instruction and description + if system_instruction is not None and description: + system_prompt = ( + f"{system_instruction}{self.sampler.fewshot_delimiter}{description}" + ) + elif system_instruction is not None: + system_prompt = system_instruction + elif description: + system_prompt = description + else: + system_prompt = "" + + # add system prompt if specified + if system_prompt: + if apply_chat_template: + labeled_examples.append({"role": "system", "content": system_prompt}) + else: + labeled_examples = system_prompt + + # if few-shot - append examples after the system prompt + if num_fewshot > 0: + if apply_chat_template: + labeled_examples.extend( + self.sampler.get_chat_context( + doc, num_fewshot, fewshot_as_multiturn + ) + ) + else: + labeled_examples += self.sampler.get_context(doc, num_fewshot) + + example = self.doc_to_text(doc) + if apply_chat_template: + if self.multiple_input: + return chat_template(labeled_examples) + if isinstance(example, str): + self.append_target_question( + labeled_examples, example, fewshot_as_multiturn + ) + # for loglikelihood create a list of questions with appended choices + elif isinstance(example, list): + labeled_examples_list = [] + # copy chat history for each example and append the answer + for ex in example: + chat = deepcopy(labeled_examples) + self.append_target_question(chat, ex, fewshot_as_multiturn) + labeled_examples_list.append(chat_template(chat)) + return labeled_examples_list + # if example is an integer, append the choice or convert to string + elif isinstance(example, int): + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + self.append_target_question( + labeled_examples, choices[example], fewshot_as_multiturn + ) + else: + self.append_target_question( + labeled_examples, str(example), fewshot_as_multiturn + ) + # return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) + else: + if self.multiple_input: + return labeled_examples + if isinstance(example, str): + return labeled_examples + example + elif isinstance(example, list): + return [labeled_examples + ex for ex in example] + elif isinstance(example, int): + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + return labeled_examples + choices[example] + else: + return labeled_examples + str(example) + + def apply_filters(self): + """Iterates over FilterEnsembles and applies them to instances""" + if hasattr(self, "_filters"): + for f in self._filters: + f.apply(self._instances) + else: + eval_logger.warning("No filter defined, passing through instances") + return self._instances + + def should_decontaminate(self): + return self.config.should_decontaminate + + def doc_to_decontamination_query(self, doc): + if self.config.should_decontaminate: + if self.config.doc_to_decontamination_query is None: + return self.doc_to_text(doc) + else: + doc_to_decontamination_query = self.config.doc_to_decontamination_query + if doc_to_decontamination_query in self.features: + return doc[doc_to_decontamination_query] + elif callable(doc_to_decontamination_query): + return doc_to_decontamination_query(doc) + else: + return ast.literal_eval( + utils.apply_template( + self.config.doc_to_decontamination_query, doc + ) + ) + + 
def _process_doc(self, doc: dict) -> dict: + """ + Override this to process (detokenize, strip, replace, etc.) individual + documents. This can be used in a map over documents of a data split. + E.g. `map(self._process_doc, self.dataset["validation"])` + + :return: dict + The processed version of the specified `doc`. + """ + return doc + + def doc_to_text(self, doc, doc_to_text=None): + if self.prompt is not None: + doc_to_text = self.prompt + elif doc_to_text is not None: + doc_to_text = doc_to_text + else: + doc_to_text = self.config.doc_to_text + + if isinstance(doc_to_text, int): + return doc_to_text + elif isinstance(doc_to_text, str): + if doc_to_text in self.features: + # if self.config.doc_to_choice is not None: + # return self.doc_to_choice(doc)[doc[doc_to_text]] + # else: + return doc[doc_to_text] + else: + text_string = utils.apply_template(doc_to_text, doc) + if text_string.isdigit() and self._config.doc_to_choice is not None: + return ast.literal_eval(text_string) + else: + return text_string + elif callable(doc_to_text): + return doc_to_text(doc) + # Used when applying a Promptsource template + elif hasattr(doc_to_text, "apply"): + applied_prompt = doc_to_text.apply(doc) + if len(applied_prompt) == 2: + return applied_prompt[0] + else: + eval_logger.warning("Applied prompt returns empty string") + return self.config.fewshot_delimiter + else: + print(type(doc_to_text)) + raise TypeError + + def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]: + if self.prompt is not None: + doc_to_target = self.prompt + elif doc_to_target is not None: + doc_to_target = doc_to_target + else: + doc_to_target = self.config.doc_to_target + + if isinstance(doc_to_target, int): + return doc_to_target + elif isinstance(doc_to_target, str): + if doc_to_target in self.features: + # if self.config.doc_to_choice is not None: + # return self.doc_to_choice(doc)[doc[doc_to_target]] + # else: + return doc[doc_to_target] + else: + target_string = utils.apply_template(doc_to_target, doc) + if target_string.isdigit() and self._config.doc_to_choice is not None: + return ast.literal_eval(target_string) + elif ( + len(target_string) >= 2 + and (target_string[0] == "[") + and (target_string[-1] == "]") + ): + try: + return ast.literal_eval(target_string) + except (SyntaxError, ValueError): + return target_string + else: + return target_string + elif isinstance(doc_to_target, list): + return doc_to_target + elif callable(doc_to_target): + return doc_to_target(doc) + # Used when applying a Promptsource template + elif hasattr(doc_to_target, "apply"): + applied_prompt = doc_to_target.apply(doc) + if len(applied_prompt) == 2: + return applied_prompt[1] + else: + eval_logger.warning("Applied prompt returns empty string") + return self.config.fewshot_delimiter + else: + raise TypeError + + def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]: + if self.prompt is not None: + doc_to_choice = self.prompt + elif doc_to_choice is not None: + doc_to_choice = doc_to_choice + elif self.config.doc_to_choice is None: + eval_logger.error("doc_to_choice was called but not set in config") + else: + doc_to_choice = self.config.doc_to_choice + + if isinstance(doc_to_choice, str): + if doc_to_choice in self.features: + return doc[doc_to_choice] + else: + return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) + elif isinstance(doc_to_choice, list): + return doc_to_choice + elif isinstance(doc_to_choice, dict): + return list(doc_to_choice.values()) + elif callable(doc_to_choice): + 
return doc_to_choice(doc) + elif hasattr(doc_to_choice, "get_answer_choices_list"): + return doc_to_choice.get_answer_choices_list(doc) + else: + raise TypeError + + def construct_requests( + self, doc: dict, ctx: str, **kwargs + ) -> Union[List[Instance], Instance]: + if self.OUTPUT_TYPE == "loglikelihood": + arguments = (ctx, self.doc_to_target(doc)) + elif self.OUTPUT_TYPE == "loglikelihood_rolling": + arguments = (self.doc_to_target(doc),) + elif self.OUTPUT_TYPE == "multiple_choice": + choices = self.doc_to_choice(doc) + target_delimiter = self.config.target_delimiter + if self.multiple_input: + # If there are multiple inputs, choices are placed in the ctx + cont = self.doc_to_target(doc) + arguments = [ + (ctx + choice, f"{target_delimiter}{cont}") for choice in choices + ] + else: + # Otherwise they are placed in the continuation + arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] + + request_list = [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=arg, + idx=i, + **kwargs, + ) + for i, arg in enumerate(arguments) + ] + # TODO: we should raise a warning telling users this will at most ~2x runtime. + if "acc_mutual_info" in self._metric_fn_list.keys(): + # if we are calculating multiple choice accuracy + # using mutual information instead of raw loglikelihood as metric, need unconditional lls. + + # here mutual info refers to calculating + # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) + # in other words normalizing by subtracting the unconditional logprob of each choice. + request_list.extend( + [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=("", "{}".format(choice)), + idx=i, + **kwargs, + ) + for i, choice in enumerate(choices) + ] + ) + return request_list + + elif self.OUTPUT_TYPE == "generate_until": + arguments = (ctx, deepcopy(self.config.generation_kwargs)) + + return Instance( + request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs + ) + + def process_results(self, doc, results): + if callable(self.config.process_results): + return self.config.process_results(doc, results) + + result_dict = {} + use_metric = list(self._metric_fn_list.keys()) + if self.OUTPUT_TYPE == "loglikelihood": + results = results[0] + ll, is_greedy = results + return { + **({"perplexity": ll} if "perplexity" in use_metric else {}), + **({"acc": int(is_greedy)} if "acc" in use_metric else {}), + } + elif self.OUTPUT_TYPE == "loglikelihood_rolling": + (loglikelihood,) = results + _words = self.count_words(self.doc_to_target(doc)) + _bytes = self.count_bytes(self.doc_to_target(doc)) + return { + **( + {"word_perplexity": (loglikelihood, _words)} + if "word_perplexity" in use_metric + else {} + ), + **( + {"byte_perplexity": (loglikelihood, _bytes)} + if "byte_perplexity" in use_metric + else {} + ), + **( + {"bits_per_byte": (loglikelihood, _bytes)} + if "bits_per_byte" in use_metric + else {} + ), + } + elif self.OUTPUT_TYPE == "multiple_choice": + lls, is_greedy = zip(*results) + + # retrieve choices in List[str] form, to compute choice lengths, etc. + choices = self.doc_to_choice(doc) + completion_len = np.array([float(len(i)) for i in choices]) + + if ( + 2 * len(choices) == len(lls) + and "acc_mutual_info" in self._metric_fn_list.keys() + ): + # then we are doing mutual info. 
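+                # (The stride-2 slicing below assumes the conditional and
+                # unconditional loglikelihoods for each choice arrive interleaved.)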
+ # this stores the "dryrun" / unconditional answer loglikelihoods + lls_unconditional = lls[1::2] + if len(lls_unconditional) != len(choices): + raise ValueError + # and this stores our "regular" conditional loglikelihoods + lls = lls[::2] + + pred = np.argmax(lls) + pred_norm = np.argmax(lls / completion_len) + + if self.multiple_input: + gold = self.doc_to_text(doc) + else: + gold = self.doc_to_target(doc) + + gold_index_error = False + if isinstance(gold, list): + gold = [i if i < len(choices) else -100 for i in gold] + if -100 in gold: + gold_index_error = True + else: + if isinstance(gold, int): + gold = gold if gold < len(choices) else -100 + elif isinstance(gold, str): + gold = choices.index(gold) if gold in choices else -100 + + if gold == -100: + gold_index_error = True + + if gold_index_error: + eval_logger.warning( + f"Label index was not in within range of available choices," + f"Sample:\n\n{doc}\n\n" + ) + + if self.multiple_target: + acc = 1.0 if pred in gold else 0.0 + acc_norm = 1.0 if pred_norm in gold else 0.0 + exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold])) + else: + acc = 1.0 if pred == gold else 0.0 + acc_norm = 1.0 if pred_norm == gold else 0.0 + # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly + exact_match = int(is_greedy[gold]) if gold != -100 else 0 + + prob_norm = utils.softmax(lls) + + # TODO use keyword arguments to the metric? + # gold, pred, norm stuff, the original lls, + result_dict = { + **({"acc": acc} if "acc" in use_metric else {}), + **({"f1": (gold, pred)} if "f1" in use_metric else {}), + **({"mcc": (gold, pred)} if "mcc" in use_metric else {}), + **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}), + **({"exact_match": exact_match} if "exact_match" in use_metric else {}), + **( + {"brier_score": (gold, prob_norm)} + if "brier_score" in use_metric + else {} + ), + } + + if "acc_mutual_info" in use_metric: + lls_mutual_info = [ + ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional) + ] + acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0 + result_dict["acc_mutual_info"] = acc_mutual_info + + elif self.OUTPUT_TYPE == "generate_until": + gold = self.doc_to_target(doc) + result = results[0] + if self.config.doc_to_choice is not None: + # If you set doc_to_choice, + # it assumes that doc_to_target returns a number. + choices = self.doc_to_choice(doc) + gold = choices[gold] + # we expect multiple_targets to be a list. 
+ elif self.multiple_target: + gold = list(gold) + elif type(gold) != type(result): + # cast gold to the same type as result + gold = type(result)(gold) + + for metric in self._metric_fn_list.keys(): + if self.multiple_target: + # in the case where we have multiple targets, + # return true if any are true + # TODO: this may break for multipLe_target, non zero-or-1 metrics + scores = [] + if not isinstance(gold, list): + # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer + # print(gold) + gold = [gold] + if metric == "exact_match": + result = [result for _ in range(len(gold))] + scores = self._metric_fn_list[metric]( + references=gold, + predictions=result, + **self._metric_fn_kwargs[metric], + )[metric] + result_score = 1.0 if scores > 0.0 else 0.0 + else: + for gold_option in gold: + try: + result_score = self._metric_fn_list[metric]( + references=[gold_option], + predictions=[result], + **self._metric_fn_kwargs[metric], + ) + except ( + TypeError + ): # TODO: this is hacky and I don't want to do it + result_score = self._metric_fn_list[metric]( + [gold_option, result] + ) + if isinstance(result_score, dict): + # TODO: this handles the case where HF evaluate returns a dict. + result_score = result_score[metric] + scores.append(result_score) + if any(scores): + result_score = 1.0 + else: + result_score = 0.0 + else: + try: + result_score = self._metric_fn_list[metric]( + references=[gold], + predictions=[result], + **self._metric_fn_kwargs[metric], + ) + except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + result_score = self._metric_fn_list[metric]([gold, result]) + if isinstance(result_score, dict): + # TODO: this handles the case where HF evaluate returns a dict. + result_score = result_score[metric] + result_dict[metric] = result_score + else: + raise ValueError( + f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ", + "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'", + ) + + return result_dict + + def aggregation(self) -> dict: + return self._aggregation_list + + def higher_is_better(self) -> dict: + return self._higher_is_better + + def get_config(self, key: str) -> Any: + return getattr(self._config, key, None) + + @property + def task_name(self) -> Any: + return getattr(self.config, "task", None) + + def __repr__(self): + return ( + f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," + f"output_type={self.OUTPUT_TYPE}," + f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," + f"num_samples={len(self.eval_docs)})" + ) + + +class MultipleChoiceTask(Task): + OUTPUT_TYPE = "loglikelihood" + + def doc_to_target(self, doc: dict) -> str: + return " " + doc["choices"][doc["gold"]] + + def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]: + # TODO: add mutual info here? + return [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=(ctx, " {}".format(choice)), + idx=i, + **kwargs, + ) + for i, choice in enumerate(doc["choices"]) + ] + + def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict: + results = [ + res[0] for res in results + ] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere? 
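+        # acc_norm below length-normalizes each loglikelihood by the character
+        # length of its choice, so longer answer strings are not penalized simply
+        # for containing more characters.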
+ gold = doc["gold"] + + acc = 1.0 if np.argmax(results) == gold else 0.0 + completion_len = np.array([float(len(i)) for i in doc["choices"]]) + acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0 + + return { + "acc": acc, + "acc_norm": acc_norm, + } + + def higher_is_better(self) -> dict: + return { + "acc": True, + "acc_norm": True, + } + + def aggregation(self) -> dict: + return { + "acc": mean, + "acc_norm": mean, + } + + +class PerplexityTask(Task): + OUTPUT_TYPE = "loglikelihood_rolling" + + def has_training_docs(self) -> bool: + return False + + def fewshot_examples(self, k: int, rnd) -> List: + if k != 0: + raise ValueError( + "The number of fewshot examples must be 0 for perplexity tasks." + ) + return [] + + def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]: + if num_fewshot != 0: + raise ValueError( + "The number of fewshot examples must be 0 for perplexity tasks." + ) + + return "" + + def higher_is_better(self) -> dict: + return { + "word_perplexity": False, + "byte_perplexity": False, + "bits_per_byte": False, + } + + def doc_to_decontamination_query(self, doc): + return doc + + def doc_to_text(self, doc) -> str: + return "" + + def doc_to_target(self, doc): + return doc + + def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs): + if bool(ctx): + raise ValueError + + return Instance( + request_type=self.OUTPUT_TYPE, + doc=doc, + arguments=(self.doc_to_target(doc),), + idx=0, + **kwargs, + ) + + def process_results(self, doc: dict, results: Tuple[float]) -> dict: + (loglikelihood,) = results + words = self.count_words(self.doc_to_target(doc)) + bytes_ = self.count_bytes(self.doc_to_target(doc)) + return { + "word_perplexity": (loglikelihood, words), + "byte_perplexity": (loglikelihood, bytes_), + "bits_per_byte": (loglikelihood, bytes_), + } + + def aggregation(self) -> dict: + return { + "word_perplexity": weighted_perplexity, + "byte_perplexity": weighted_perplexity, + "bits_per_byte": bits_per_byte, + } + + @classmethod + def count_bytes(cls, doc) -> int: + return len(doc.encode("utf-8")) + + @classmethod + def count_words(cls, doc) -> int: + """Downstream tasks with custom word boundaries should override this!""" + return len(re.split(r"\s+", doc)) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b2441eb878de8d3b58af798ba9f19cda6f82d19 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py @@ -0,0 +1,28 @@ +from . 
import ( + anthropic_llms, + api_models, + dummy, + gguf, + huggingface, + mamba_lm, + nemo_lm, + neuralmagic, + neuron_optimum, + openai_completions, + optimum_lm, + textsynth, + vllm_causallms, +) + + +# TODO: implement __all__ + + +try: + # enable hf hub transfer if available + import hf_transfer # type: ignore # noqa + import huggingface_hub.constants # type: ignore + + huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True +except ImportError: + pass diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py new file mode 100644 index
0000000000000000000000000000000000000000..7b22b6a979ca12f6a68af7a16e3c50a8ad233ddf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py @@ -0,0 +1,362 @@ +import os +from functools import cached_property +from typing import Any, Dict, List, Tuple, Union + +from tqdm import tqdm + +from lm_eval import utils +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.openai_completions import LocalCompletionsAPI +from lm_eval.models.utils import retry_on_specific_exceptions + + +eval_logger = utils.eval_logger + + +def anthropic_completion( + client, #: anthropic.Anthropic, + model: str, + prompt: str, + max_tokens_to_sample: int, + temperature: float, + stop: List[str], + **kwargs: Any, +) -> str: + """Wrapper function around the Anthropic completion API client with exponential back-off + in case of RateLimitError. + + params: + client: anthropic.Anthropic + Anthropic API client + model: str + Anthropic model e.g. 'claude-instant-v1', 'claude-2' + prompt: str + Prompt to feed to the model + max_tokens_to_sample: int + Maximum number of tokens to sample from the model + temperature: float + Sampling temperature + stop: List[str] + List of stop sequences + kwargs: Any + Additional model_args to pass to the API client + """ + + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + def _exception_callback(e: Exception, sleep_time: float) -> None: + eval_logger.warning( + f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" + ) + + @retry_on_specific_exceptions( + on_exceptions=[anthropic.RateLimitError], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + response = client.completions.create( + prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}", + model=model, + # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences + # (e.g. gsm8k's ":") may truncate a lot of the input. + stop_sequences=[anthropic.HUMAN_PROMPT] + stop, + max_tokens_to_sample=max_tokens_to_sample, + temperature=temperature, + **kwargs, + ) + return response.completion + + return completion() + + +def anthropic_chat( + client, #: anthropic.Anthropic, + model: str, + prompt: str, + max_tokens: int, + temperature: float, + stop: List[str], + **kwargs: Any, +) -> str: + """Wrapper function around the Anthropic completion API client with exponential back-off + in case of RateLimitError. + + params: + client: anthropic.Anthropic + Anthropic API client + model: str + Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229' + prompt: str + Prompt to feed to the model + max_tokens: int + Maximum number of tokens to sample from the model + temperature: float + Sampling temperature + stop: List[str] + List of stop sequences + kwargs: Any + Additional model_args to pass to the API client + """ + + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. 
\ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + def _exception_callback(e: Exception, sleep_time: float) -> None: + eval_logger.warning( + f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" + ) + + @retry_on_specific_exceptions( + on_exceptions=[ + anthropic.RateLimitError, + anthropic.APIConnectionError, + anthropic.APIStatusError, + ], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def messages(): + response = client.messages.create( + model=model, + max_tokens=max_tokens, + temperature=temperature, + messages=[{"role": "user", "content": f"{prompt}"}], + **kwargs, + ) + return response.content[0].text + + return messages() + + +@register_model("anthropic-completions") +class AnthropicLM(LM): + REQ_CHUNK_SIZE = 20 # TODO: not used + + def __init__( + self, + batch_size: int = 1, + model: str = "claude-2.0", + max_tokens_to_sample: int = 256, + temperature: float = 0, # defaults to 1 + **kwargs, # top_p, top_k, etc. + ) -> None: + """Anthropic API wrapper. + + :param model: str + Anthropic model e.g. 'claude-instant-v1', 'claude-2' + :param max_tokens_to_sample: int + Maximum number of tokens to sample from the model + :param temperature: float + Sampling temperature + :param kwargs: Any + Additional model_args to pass to the API client + """ + super().__init__() + + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + self.model = model + # defaults to os.environ.get("ANTHROPIC_API_KEY") + self.client = anthropic.Anthropic() + self.temperature = temperature + self.max_tokens_to_sample = max_tokens_to_sample + self.tokenizer = self.client.get_tokenizer() + self.kwargs = kwargs + + @property + def eot_token_id(self): + # Not sure but anthropic.HUMAN_PROMPT ? + raise NotImplementedError("No idea about anthropic tokenization.") + + @property + def max_length(self) -> int: + return 2048 + + @property + def max_gen_toks(self) -> int: + return self.max_tokens_to_sample + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string).ids + + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. 
\ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + if not requests: + return [] + + _requests: List[Tuple[str, dict]] = [req.args for req in requests] + + res = [] + for request in tqdm(_requests, disable=disable_tqdm): + try: + inp = request[0] + request_args = request[1] + # generation_kwargs + until = request_args.get("until") + max_gen_toks = request_args.get("max_gen_toks", self.max_length) + temperature = request_args.get("temperature", self.temperature) + response = anthropic_completion( + client=self.client, + model=self.model, + prompt=inp, + max_tokens_to_sample=max_gen_toks, + temperature=temperature, # TODO: implement non-greedy sampling for Anthropic + stop=until, # type: ignore + **self.kwargs, + ) + res.append(response) + + self.cache_hook.add_partial("generate_until", request, response) + except anthropic.APIConnectionError as e: # type: ignore # noqa: F821 + eval_logger.critical(f"Server unreachable: {e.__cause__}") + break + except anthropic.APIStatusError as e: # type: ignore # noqa: F821 + eval_logger.critical(f"API error {e.status_code}: {e.message}") + break + + return res + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + +@register_model("anthropic-chat", "anthropic-chat-completions") +class AnthropicChat(LocalCompletionsAPI): + def __init__( + self, + base_url="https://api.anthropic.com/v1/messages", + tokenizer_backend=None, + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + eval_logger.warning( + "Chat completions does not support batching. Defaulting to batch size 1." + ) + self._batch_size = 1 + self.anthropic_version = "2023-06-01" + eval_logger.warning( + f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning" + ) + + @cached_property + def api_key(self): + """Override this property to return the API key for the API request.""" + key = os.environ.get("ANTHROPIC_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the ANTHROPIC_API_KEY environment variable." 
+ ) + return key + + @cached_property + def header(self): + return { + "x-api-key": f"{self.api_key}", + "anthropic-version": self.anthropic_version, + } + + def _create_payload( + self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs + ) -> dict: + system = ( + messages[0].get("content") if messages[0].get("role") == "system" else None + ) + if system: + messages = messages[1:] + gen_kwargs.pop("do_sample", False) + max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = gen_kwargs.pop("until", ["\n\nHuman:"]) + if not isinstance(stop, list): + stop = [stop] + out = { + "messages": messages, + "model": self.model, + "max_tokens": max_tokens, + "temperature": temperature, + "stop_sequences": stop, + **gen_kwargs, + } + if system: + out["system"] = system + return out + + def parse_generations( + self, outputs: Union[Dict, List[Dict]], **kwargs + ) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choices in out["content"]: + res.append(choices["text"]) + return res + + def tok_encode( + self, + string: str, + left_truncate_len=None, + add_special_tokens=None, + **kwargs, + ) -> List[str]: + return [string] + + def loglikelihood(self, requests, **kwargs): + raise NotImplementedError( + "Anthropic Chat Completions API does not support the return of loglikelihood" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py new file mode 100644 index 0000000000000000000000000000000000000000..ff72925cad9e53d090d42b3c0794edb784e1d614 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py @@ -0,0 +1,641 @@ +import abc +import asyncio +import copy +import itertools +import json +from functools import cached_property +from typing import ( + Any, + Awaitable, + Callable, + Dict, + Iterable, + List, + Literal, + NamedTuple, + Optional, + Tuple, + Union, +) + + +try: + import requests + from aiohttp import ClientSession, TCPConnector + from tenacity import RetryError, retry, stop_after_attempt, wait_exponential + from tqdm import tqdm + from tqdm.asyncio import tqdm_asyncio +except ModuleNotFoundError: + pass + + +from importlib.util import find_spec + +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import TemplateLM +from lm_eval.models.utils import Collator, chunks, configure_pad_token + + +LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]] + + +# utility class to keep track of json encoded chats +class JsonChatStr(NamedTuple): + prompt: str + + def encode(self, encoding): + return self.prompt.encode(encoding) + + +eval_logger = utils.eval_logger + + +class TemplateAPI(TemplateLM): + def __init__( + self, + model: str = None, + pretrained: str = None, # `model` takes precedence over `pretrained` when passed. + base_url: str = None, + tokenizer: Optional[str] = None, + # Logliklehood tasks require a tokenizer to calculate context lengths, + # however the requests can be sent as a string if the API doesn't support token inputs. + # use tokenized_requests=False + tokenizer_backend: Optional[ + Literal["tiktoken", "huggingface", None] + ] = "huggingface", + truncate: bool = False, + # number of concurrent requests. 
More useful if not batching + num_concurrent: int = 1, + max_retries: int = 3, + max_gen_toks: int = 256, + batch_size: Union[str, int] = 1, + seed: int = 1234, + max_length: Optional[int] = 2048, + add_bos_token: bool = False, + custom_prefix_token_id=None, + # send the requests as tokens or strings + tokenized_requests=True, + **kwargs, + ) -> None: + super().__init__() + missing_packages = [ + pkg + for pkg in ["aiohttp", "tqdm", "tenacity", "requests"] + if find_spec(pkg) is None + ] + if missing_packages: + raise ModuleNotFoundError( + f"Attempted to use an API model, but the required packages {missing_packages} are not installed. " + 'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`' + ) + self.model = model or pretrained + self.base_url = base_url + self.tokenizer = tokenizer + if not isinstance(batch_size, int) and "auto" in batch_size: + eval_logger.warning( + "Automatic batch size is not supported for API models. Defaulting to batch size 1." + ) + elif int(batch_size) > 1: + eval_logger.warning( + "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths." + ) + self._batch_size = int(batch_size) if batch_size != "auto" else 1 + self._truncate = truncate + self._max_gen_toks = int(max_gen_toks) + self._seed = int(seed) + self.max_length = max_length + if int(num_concurrent) <= 1: + eval_logger.info( + "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1." + ) + self._concurrent = int(num_concurrent) + self.tokenizer_backend = tokenizer_backend + self.add_bos_token = add_bos_token + self.custom_prefix_token_id = custom_prefix_token_id + self.tokenized_requests = tokenized_requests + self.max_retries = int(max_retries) + + eval_logger.info(f"Using tokenizer {self.tokenizer_backend}") + if self.tokenizer_backend is None: + self.tokenizer = None + self.tokenized_requests = False + else: + if self.tokenizer is None: + if self.tokenizer_backend == "huggingface": + import transformers + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + self.tokenizer if self.tokenizer else self.model + ) + # Not used as the API will handle padding but to mirror the behavior of the HFLM + self.tokenizer = configure_pad_token(self.tokenizer) + elif self.tokenizer_backend == "tiktoken": + try: + import tiktoken + + self.tokenizer = tiktoken.encoding_for_model(self.model) + except ModuleNotFoundError as e: + raise Exception( + "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. " + "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`." + ) from e + if "openai" not in self.base_url: + eval_logger.warning( + f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. " + "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken." 
+ ) + else: + import transformers + + assert isinstance(tokenizer, str), "tokenizer must be a string" + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + ) + + @abc.abstractmethod + def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + *, + generate: bool = True, + gen_kwargs: Optional[dict] = None, + seed: int = 1234, + **kwargs, + ) -> dict: + """This method is responsible for creating the json payload that will be sent to the API.""" + raise NotImplementedError + + def create_message( + self, + messages: Union[List[List[int]], List[str], List[JsonChatStr]], + generate=False, + ) -> Union[List[List[int]], List[dict], List[str], str]: + """Helper method to transform the prompt into the expected API input format. messages consist of batched requests""" + if isinstance(messages[0], JsonChatStr): + # for chat completions we need to decode the json string to list[dict,...] + assert ( + self._batch_size == 1 + ), "non-tokenized chat requests are only supported with batch_size=1" + # list[dict["role":..., "content":...],...] + return json.loads(messages[0].prompt) + + if not self.tokenized_requests: + # if messages are tokenized: + if isinstance(messages[0][0], int): + # assuming decoding is lossless. However, this is only for logliklehood requests + # as we need to compute the context length. For generations, we don't need to tokenize. + messages = self.decode_batch(messages) + if self._batch_size <= 1: + # if batch is 1 return str + return messages[0] + else: + # list[str,...] + return messages + + # list[list[int], ...] + return messages + + @staticmethod + @abc.abstractmethod + def parse_logprobs( + outputs: Union[Any, List[Any]], + tokens: List[List[int]] = None, + ctxlen: List[int] = None, + **kwargs, + ) -> List[Tuple[float, bool]]: + """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples""" + raise NotImplementedError + + @staticmethod + @abc.abstractmethod + def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]: + """Method used to parse the generations from the (batched) API response. This method should return a list of str""" + raise NotImplementedError + + @cached_property + def api_key(self) -> str: + """Override this property to return the API key for the API request.""" + return "" + + @cached_property + def header(self) -> dict: + """Override this property to return the headers for the API request.""" + return {"Authorization": f"Bearer {self.api_key}"} + + @property + def chat_template(self) -> str: + """Must be defined for LM subclasses that implement Chat Templating. + Should return the structure of the chat template applied to user/assistant messages. + Only used for logging and reproducibility. + """ + return "" + + @property + def tokenizer_name(self) -> str: + """Must be defined for LM subclasses which implement Chat Templating. + Should return the name of the tokenizer or chat template used. + Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used. + """ + return "" + + def apply_chat_template( + self, chat_history: List[Dict[str, str]] + ) -> Union[str, JsonChatStr]: + """Applies a chat template to a list of chat history between user and model.""" + if self.tokenizer_backend == "huggingface" and self.tokenized_requests: + return self.tokenizer.apply_chat_template( + chat_history, tokenize=False, add_generation_prompt=True + ) + else: + # bit of a hack. 
We'll load back before sending to the API + return JsonChatStr(json.dumps(chat_history)) + + @cached_property + def eot_token_id(self) -> Optional[int]: + if self.tokenizer is None: + return None + else: + if self.tokenizer_backend == "huggingface": + return self.tokenizer.eos_token_id + elif self.tokenizer_backend == "tiktoken": + return self.tokenizer.eot_token + + @cached_property + def prefix_token_id(self) -> Optional[int]: + if self.tokenizer is None: + return None + else: + if self.custom_prefix_token_id is not None: + return self.custom_prefix_token_id + if self.tokenizer_backend == "huggingface": + if self.tokenizer.bos_token_id is not None: + return self.tokenizer.bos_token_id + return self.tokenizer.eos_token_id + else: + return self.tokenizer.eot_token + + def tok_encode( + self, + string: str, + left_truncate_len: int = None, + add_special_tokens: bool = False, + truncation: bool = False, + **kwargs, + ) -> Union[List[List[int]], List[int], List[str]]: + if self.tokenizer_backend is None: + return [string] + elif self.tokenizer_backend == "huggingface": + # by default for CausalLM - false or self.add_bos_token is set + if not add_special_tokens: + add_special_tokens = False or self.add_bos_token + encoding: Union[List[List[int]], List[int]] = self.tokenizer( + string, + add_special_tokens=add_special_tokens, + truncation=truncation, + return_attention_mask=False, + ).input_ids + + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + if not isinstance(string, str): + encoding = [enc[-left_truncate_len:] for enc in encoding] + else: + encoding = encoding[-left_truncate_len:] + + return encoding + + else: + try: + encoding = self.tokenizer.encode(string) + except Exception: + encoding = self.tokenizer.encode_batch(string) + return encoding + + def decode_batch(self, tokens: List[List[int]]) -> List[str]: + if self.tokenizer_backend == "huggingface": + return self.tokenizer.batch_decode(tokens) + elif self.tokenizer_backend == "tiktoken": + return self.tokenizer.decode_batch(tokens) + + def model_call( + self, + messages: Union[List[List[int]], List[str], List[JsonChatStr]], + *, + generate: bool = True, + gen_kwargs: Optional[Dict] = None, + **kwargs, + ) -> Optional[dict]: + # !!! Copy: shared dict for each request, need new object !!! + gen_kwargs = copy.deepcopy(gen_kwargs) + try: + response = requests.post( + self.base_url, + json=self._create_payload( + self.create_message(messages), + generate=generate, + gen_kwargs=gen_kwargs, + seed=self._seed, + **kwargs, + ), + headers=self.header, + ) + if not response.ok: + eval_logger.warning( + f"API request failed with error message: {response.text}. Retrying..." + ) + response.raise_for_status() + return response.json() + except RetryError: + eval_logger.error( + "API request failed after multiple retries. Please check the API status." + ) + return None + + async def amodel_call( + self, + session: ClientSession, + messages: Union[List[List[int]], List[str], List[JsonChatStr]], + *, + generate: bool = True, + cache_keys: list = None, + ctxlens: Optional[List[int]] = None, + gen_kwargs: Optional[Dict] = None, + **kwargs, + ) -> Union[List[str], List[Tuple[float, bool]], None]: + # !!! Copy: shared dict for each request, need new object !!! 
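+        # (The gen_kwargs dict arrives shared between requests, and payload builders
+        # such as _create_payload pop keys like "until" or "max_gen_toks" from it, so
+        # without the deep copy the first request in a batch could strip those keys
+        # for every later one.)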
+ gen_kwargs = copy.deepcopy(gen_kwargs) + payload = self._create_payload( + self.create_message(messages), + generate=generate, + gen_kwargs=gen_kwargs, + seed=self._seed, + **kwargs, + ) + cache_method = "generate_until" if generate else "loglikelihood" + try: + async with session.post( + self.base_url, + json=payload, + headers=self.header, + ) as response: + if not response.ok: + error_text = await response.text() + eval_logger.warning( + f"API request failed with error message: {error_text}. Retrying..." + ) + # raising exception will retry the request + response.raise_for_status() + outputs = await response.json() + answers = ( + self.parse_generations( + outputs=outputs, + ) + if generate + else self.parse_logprobs( + outputs=outputs, + tokens=messages, + ctxlens=ctxlens, + ) + ) + if cache_keys: + for res, cache in zip(answers, cache_keys): + self.cache_hook.add_partial(cache_method, cache, res) + return answers + # If the retries also fail + except RetryError: + eval_logger.error( + "API request failed after multiple retries. Please check the API status." + ) + return None + + def batch_logliklehood_requests( + self, chunks: Iterable[List[LogLikelihoodInputs]] + ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]: + inputs = [] + ctxlens = [] + cache_keys = [] + for chunk in chunks: + for cache_key, context_enc, continuation_enc in chunk: + inp = (context_enc + continuation_enc)[-(self.max_length) :] + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length) + ) + + inputs.append(inp) + ctxlens.append(ctxlen) + cache_keys.append(cache_key) + return inputs, ctxlens, cache_keys + + async def get_batched_requests( + self, + requests: list, + cache_keys: list, + *, + generate: bool = True, + ctxlens: List[int] = None, + **kwargs, + ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]: + ctxlens = ctxlens if ctxlens else [None] * len(requests) + conn = TCPConnector(limit=self._concurrent) + async with ClientSession(connector=conn) as session: + retry_: Callable[..., Awaitable[Any]] = retry( + stop=stop_after_attempt(self.max_retries), + wait=wait_exponential(multiplier=0.5, min=1, max=10), + reraise=True, + )(self.amodel_call) + # Create tasks for each batch of request + tasks = [ + asyncio.create_task( + retry_( + session=session, + messages=message, + cache_keys=cache_key, + generate=generate, + ctxlens=ctxlen, + **kwargs, + ) + ) + for message, cache_key, ctxlen in zip( + chunks(requests, n=self._batch_size), + chunks(cache_keys, n=self._batch_size), + chunks(ctxlens, n=self._batch_size), + ) + ] + + return await tqdm_asyncio.gather(*tasks, desc="Requesting API") + + def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: + assert ( + self.tokenizer is not None + ), "Tokenizer is required for loglikelihood tasks to compute context lengths." + res = [] + + def _collate(req: LogLikelihoodInputs): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. 
this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + re_ord = Collator( + requests, + sort_fn=_collate, + group_by=None, + ) + # if concurrent then we'll batch in the async context + chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0) + if self._concurrent <= 1: + pbar = tqdm(desc="Requesting API", total=len(requests)) + for chunk in chunked: + inputs, ctxlens, cache_keys = self.batch_logliklehood_requests([chunk]) + + outputs = retry( + stop=stop_after_attempt(self.max_retries), + wait=wait_exponential(multiplier=0.5, min=1, max=10), + reraise=True, + )(self.model_call)(messages=inputs, generate=False) + if isinstance(outputs, dict): + outputs = [outputs] + for answer_, cache_key in zip( + self.parse_logprobs( + outputs=outputs, tokens=inputs, ctxlens=ctxlens + ), + cache_keys, + ): + if answer_ is not None: + res.append(answer_) + # partial caching + if cache_key is not None: + self.cache_hook.add_partial( + "loglikelihood", cache_key, answer_ + ) + pbar.update(1) + else: + inputs, ctxlens, cache_keys = self.batch_logliklehood_requests(chunked) + res = itertools.chain.from_iterable( + asyncio.run( + self.get_batched_requests( + inputs, cache_keys, generate=False, ctxlens=ctxlens + ) + ) + ) + + return re_ord.get_original(res) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + res = [] + + def _collate_gen(_requests): + # sort by the length of the non-tokenized contexts + return -len(_requests[0]) + + # Let the API deal with tokenization + requests, all_gen_kwargs = zip(*(req.args for req in requests)) + if self.tokenized_requests: + encodings_list = self.tok_encode( + requests, add_special_tokens=self.add_bos_token + ) + else: + encodings_list = [None] * len(requests) + requests = [ + (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list) + ] + + re_ord = Collator( + requests, + sort_fn=_collate_gen, + group_by="gen_kwargs", + ) + chunked = re_ord.get_batched( + n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None + ) + if self._concurrent <= 1: + pbar = tqdm(desc="Requesting API", total=len(requests)) + for chunk in chunked: + contexts, all_gen_kwargs, encodings_list = zip(*chunk) + req = encodings_list if self.tokenized_requests else contexts + outputs = retry( + stop=stop_after_attempt(self.max_retries), + wait=wait_exponential(multiplier=0.5, min=1, max=10), + reraise=True, + )(self.model_call)( + messages=req, + generate=True, + gen_kwargs=copy.deepcopy(all_gen_kwargs[0]), + ) + for generated_text, context in zip( + self.parse_generations( + outputs=outputs, + contexts=contexts, + ), + contexts, + ): + if generated_text is not None: + res.append(generated_text) + + # partial caching + if context is not None: + self.cache_hook.add_partial( + "generate_until", + (context, all_gen_kwargs[0]), + generated_text, + ) + pbar.update(1) + else: + for chunk in chunked: + contexts, all_gen_kwargs, encodings_list = zip(*chunk) + req = encodings_list if self.tokenized_requests else contexts + results = itertools.chain.from_iterable( + asyncio.run( + self.get_batched_requests( + req, + cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts], + generate=True, + gen_kwargs=copy.deepcopy(all_gen_kwargs[0]), + ) + ) + ) + res.extend(results) + + return re_ord.get_original(res) + + 
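+    # The rolling-loglikelihood path below scores an entire document with no
+    # conditioning prompt: the string is tokenized, split into windows of at most
+    # self.max_length tokens via utils.get_rolling_token_windows and
+    # utils.make_disjoint_window, each window is scored through _loglikelihood_tokens,
+    # and the per-window loglikelihoods are summed into one value per request. Because
+    # the windows are made disjoint, each token should be scored exactly once.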
def loglikelihood_rolling( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[float]: + loglikelihoods = [] + + for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.prefix_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + + # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + string_nll = self._loglikelihood_tokens( + rolling_token_windows, + disable_tqdm=True, + ) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + return loglikelihoods diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..afbcf52b1a5ac38f0bb1395cef82593a23524566 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py @@ -0,0 +1,1356 @@ +import copy +import os +from datetime import timedelta +from pathlib import Path +from typing import Dict, List, Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import transformers +from accelerate import ( + Accelerator, + InitProcessGroupKwargs, + find_executable_batch_size, +) +from accelerate.utils import get_max_memory +from huggingface_hub import HfApi +from packaging import version +from peft import PeftModel +from peft import __version__ as PEFT_VERSION +from tqdm import tqdm +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, +) + +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import TemplateLM +from lm_eval.api.registry import register_model +from lm_eval.models.utils import ( + Collator, + clear_torch_cache, + configure_pad_token, + get_dtype, + pad_and_concat, + stop_sequences_criteria, +) + + +eval_logger = utils.eval_logger + + +@register_model("hf-auto", "hf", "huggingface") +class HFLM(TemplateLM): + """ + An abstracted Huggingface model class. Enables usage with both models of + `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes. + + Supports data-parallel multi-GPU with HF Accelerate. 
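+
+    A minimal usage sketch ("gpt2" is only a placeholder checkpoint; any hub id or
+    local path accepted by the underlying AutoModel class should work):
+
+        lm = HFLM(pretrained="gpt2", device="cuda", batch_size=8, dtype="auto")
+        ids = lm.tok_encode("Hello world")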
+ """ + + AUTO_MODEL_CLASS = None + _DEFAULT_MAX_LENGTH = 2048 + + def __init__( + self, + pretrained: Union[str, transformers.PreTrainedModel], + backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", + # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq) + revision: Optional[str] = "main", + subfolder: Optional[str] = None, + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ] = None, + truncation: Optional[bool] = False, + logits_cache: bool = True, + max_length: Optional[int] = None, + device: Optional[str] = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + max_batch_size: Optional[int] = 64, + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + add_bos_token: Optional[bool] = False, + prefix_token_id: Optional[int] = None, + # arguments used for splitting a model across GPUs naively. + # only used if `parallelize=True`. + parallelize: Optional[bool] = False, + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[Union[str, os.PathLike]] = "./offload", + # PEFT, delta weights and quantization options + peft: Optional[str] = None, + delta: Optional[str] = None, + autogptq: Optional[Union[bool, str]] = False, + **kwargs, + ) -> None: + super().__init__() + + # optionally: take in an already-initialized transformers.PreTrainedModel + if not isinstance(pretrained, str): + eval_logger.warning( + "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way." + ) + assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" + self._model = pretrained + self._device = self._model.device + self._config = self._model.config + gpus = 0 + + else: + assert isinstance(device, str) + assert isinstance(pretrained, str) + assert isinstance(batch_size, (int, str)) + + gpus = torch.cuda.device_count() + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + if accelerator.num_processes > 1: + self.accelerator = accelerator + + if "npu" in accelerator.device.type: + gpus = torch.npu.device_count() + + # using one process with no model parallelism + if not (parallelize or accelerator.num_processes > 1): + # use user-passed device + device_list = set( + ["cuda", "cpu"] + + [f"cuda:{i}" for i in range(gpus)] + + ["mps", "mps:0"] + + [f"npu:{i}" for i in range(gpus)] + ) + if device and device in device_list: + self._device = torch.device(device) + eval_logger.info(f"Using device '{device}'") + if device in ("mps", "mps:0") and version.parse( + torch.__version__ + ) < version.parse("2.1"): + raise RuntimeError( + f"mps requires torch >= 2.1. You have {torch.__version__}" + ) + else: + eval_logger.info("Device not specified") + eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}") + self._device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + else: # Parallelism managed by accelerate + if device != "cuda": + eval_logger.info( + f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model." 
+ ) + # TODO: include in warning that `load_in_8bit` etc. affect this too + self._device = ( + self.accelerator.device + if hasattr(self, "accelerator") + else torch.device(device) + ) + + revision = str(revision) # cast to string if not already one + # TODO: update this to be less of a hack once subfolder is fixed in HF + revision = revision + ("/" + subfolder if subfolder is not None else "") + + self._get_config( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + ) + + # determine which of 'causal' and 'seq2seq' backends to use + self._get_backend( + config=self.config, backend=backend, trust_remote_code=trust_remote_code + ) + + # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT + self._create_tokenizer( + pretrained, + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast_tokenizer=use_fast_tokenizer, + ) + + # if we passed `pretrained` as a string, initialize our model now + if isinstance(pretrained, str): + self._create_model( + pretrained=pretrained, + revision=revision, + dtype=dtype, + trust_remote_code=trust_remote_code, + parallelize=parallelize, + gpus=gpus, + max_memory_per_gpu=max_memory_per_gpu, + max_cpu_memory=max_cpu_memory, + offload_folder=offload_folder, + peft=peft, + delta=delta, + autogptq=autogptq, + **kwargs, + ) + + # access self._model through self.model property outside this method + if isinstance(self.model, torch.nn.Module): + self.model.eval() + self.model.tie_weights() + + self.truncation = truncation + self.logits_cache = logits_cache + self.vocab_size = self.tokenizer.vocab_size + # select (or create) a pad token to use + self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) + + self.add_bos_token = add_bos_token + if "gemma" in getattr(self.config, "model_type", ""): + self.add_bos_token = True + eval_logger.info( + f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it." + ) + + self._max_length = max_length + self.pretrained = pretrained + self.delta = delta + self.peft = peft + self.revision = revision + self.batch_schedule = 1 + self.batch_sizes = {} + self.max_batch_size = max_batch_size + + if str(batch_size).startswith("auto"): + batch_size = batch_size.split(":") + self.batch_size_per_gpu = batch_size[0] + self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1 + else: + self.batch_size_per_gpu = int(batch_size) + + if isinstance(pretrained, str): + if gpus >= 1 or str(self.device) == "mps": + # TODO: can remove this whole snippet except in the mps case, perhaps? + if not (parallelize or autogptq or hasattr(self, "accelerator")): + # place model onto device requested manually, + # if not using HF Accelerate or device_map + # or any other option that preloads model onto device + try: + self.model.to(self.device) + except ValueError: + eval_logger.debug( + "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore." + ) + # multigpu data-parallel support when launched with accelerate + if gpus > 1: + if accelerator.num_processes > 1: + if parallelize: + eval_logger.warning( + "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available." 
+ ) + elif gpus > accelerator.num_processes: + eval_logger.warning( + "WARNING: The number of total system GPUs does not match the number of spawned processes. " + "If you would like to use data parallelism, please launch the script " + "with 'accelerate launch *script*'. " + f"Current run will proceed with {accelerator.num_processes} devices." + ) + if self.accelerator.is_local_main_process: + eval_logger.info( + f"Using {gpus} devices with data parallelism" + ) + + self._device = torch.device(f"{accelerator.device}") + self.accelerator = accelerator + + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + # if we aren't launching via accelerate, ditch + self._rank = 0 + self._world_size = 1 + else: + # if a PreTrainedModel was passed into HFLM, we forgo distributed setup. + eval_logger.warning( + "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration" + ) + self._rank = 0 + self._world_size = 1 + + self.custom_prefix_token_id = prefix_token_id + if prefix_token_id is not None: + eval_logger.info( + f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}" + ) + + def _get_accelerate_args( + self, + parallelize: bool = None, + device_map: Optional[str] = "auto", + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", + gpus: Optional[int] = None, + ) -> dict: + """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.""" + num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes + if ( + num_machines == 0 + and hasattr(self, "accelerator") + and self.accelerator is not None + ): + eval_logger.info( + "We are not in a distributed setting for accelerate. Setting model_parallel to False." 
+ ) + parallelize = False + + if parallelize is None: + # If parallelism is unset by the user, we automatically assign model parallelism + # if enough extra GPUs are available + max_memory_all_gpus = get_max_memory() + # We just want gpu, not cpu, max memory + if "cpu" in max_memory_all_gpus: + del max_memory_all_gpus["cpu"] + parallelize = bool(num_local_processes < len(max_memory_all_gpus)) + eval_logger.info( + f"Setting model parallel to {parallelize} since " + f"the number of local processes is {num_local_processes} " + f"and the number of GPUs is {len(max_memory_all_gpus)}" + ) + + args = {} + if parallelize: # Model parallelism will be used + max_memory = {} + if max_memory_per_gpu is not None: # Using the provided memory requirements + max_memory_per_gpu_map = { + device_idx: max_memory_per_gpu for device_idx in range(gpus) + } + else: # Estimating the possible memory requirements + max_memory_all_gpus = get_max_memory() + if "cpu" in max_memory_all_gpus: + del max_memory_all_gpus["cpu"] + if not hasattr(self, "accelerator"): + max_memory_per_gpu_map = { + k: v for k, v in max_memory_all_gpus.items() + } + else: + # use only 1 / num_processes of the GPUs if we are running under accelerate launch + max_memory_per_gpu_map = { + k: v + for k, v in max_memory_all_gpus.items() + if k % num_local_processes + == (self.accelerator.process_index % num_local_processes) + } + args["max_memory"] = max_memory_per_gpu_map + args["device_map"] = "auto" + eval_logger.info( + f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to 'auto'" + ) + + if max_cpu_memory is not None: + max_memory["cpu"] = max_cpu_memory + + args["offload_folder"] = offload_folder + elif ( + device_map is None + ): # No model parallelism, we use the default provided device for our model + if hasattr(self, "accelerator"): + device_map = {"": f"{self.accelerator.device}"} + else: + device_map = {"": str(self.device)} + args["max_memory"] = None + args["device_map"] = device_map + eval_logger.info( + f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}" + ) + else: + args["max_memory"] = None + args["device_map"] = None + eval_logger.info("Model parallel was set to False.") + + return args + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. 
+ return self._config + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def prefix_token_id(self): + # it is used as prefix for loglikelihood + if self.custom_prefix_token_id is not None: + return self.custom_prefix_token_id + if self.tokenizer.bos_token_id is not None: + return self.tokenizer.bos_token_id + return self.tokenizer.eos_token_id + + @property + def max_length(self): + if self._max_length: # if max length manually set, return it + return self._max_length + seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx") + for attr in seqlen_config_attrs: + if hasattr(self.model.config, attr): + return getattr(self.model.config, attr) + if hasattr(self.tokenizer, "model_max_length"): + if self.tokenizer.model_max_length == 1000000000000000019884624838656: + return self._DEFAULT_MAX_LENGTH + return self.tokenizer.model_max_length + return self._DEFAULT_MAX_LENGTH + + @property + def max_gen_toks(self) -> int: + return 256 + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + @property + def tokenizer_name(self) -> str: + return self.tokenizer.name_or_path.replace("/", "__") + + @property + def chat_template(self) -> str: + if self.tokenizer.chat_template is not None: + return self.tokenizer.chat_template + return self.tokenizer.default_chat_template + + def _get_backend( + self, + config: Union[transformers.PretrainedConfig, transformers.AutoConfig], + backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", + trust_remote_code: Optional[bool] = False, + ) -> None: + """ + Helper method during initialization. + Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) + model type to be used. + """ + assert backend in ["default", "causal", "seq2seq"] + + if backend != "default": + # if we've settled on non-default backend, use that manually + if backend == "causal": + self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + elif backend == "seq2seq": + self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM + eval_logger.info( + f"Overrode HF model backend type, and using type '{backend}'" + ) + else: + # determine and use the default HF backend for this model, based on its config + metadata. + if ( + getattr(config, "model_type") + in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + ): + # first check if model type is listed under seq2seq models, since some + # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers. + # these special cases should be treated as seq2seq models. + self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM + elif ( + getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ): + self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + else: + if not trust_remote_code: + eval_logger.warning( + "HF model type is neither marked as CausalLM or Seq2SeqLM. \ + This is expected if your model requires `trust_remote_code=True` but may be an error otherwise." 
+ ) + # if model type is neither in HF transformers causal or seq2seq model registries + # then we default to AutoModelForCausalLM + self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + + assert self.AUTO_MODEL_CLASS in [ + transformers.AutoModelForCausalLM, + transformers.AutoModelForSeq2SeqLM, + ] + return None + + def _get_config( + self, + pretrained: str, + revision: str = "main", + trust_remote_code: bool = False, + ) -> None: + self._config = transformers.AutoConfig.from_pretrained( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + ) + + def _create_model( + self, + pretrained: str, + revision: Optional[str] = "main", + dtype: Optional[Union[str, torch.dtype]] = "auto", + trust_remote_code: Optional[bool] = False, + # arguments used for splitting a model across GPUs naively. + # only used if `parallelize=True`. + # (accelerate naive PP (device_map) options) + parallelize: Optional[bool] = False, + gpus: Optional[int] = None, + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", + # PEFT, delta weights and quantization options + peft: Optional[str] = None, + delta: Optional[str] = None, + autogptq: Optional[Union[bool, str]] = False, + **kwargs, + ) -> None: + """ + Initializes an HF or HF-compatible PreTrainedModel from scratch + inside HFLM, using the kwargs passed into self.__init__(). + + Also handles functionality such as AutoGPTQ usage and PEFT wrapping. + + For future similar extensions to AutoGPTQ that are not core to HF's ecosystem, + (such as PyTorch models that are nearly, but not quite, fully mirroring + HF's public interface relied on in this HFLM class) + please consider subclassing HFLM and overriding this and other methods as needed. + """ + + model_kwargs = kwargs if kwargs else {} + + model_kwargs.update( + self._get_accelerate_args( + parallelize=parallelize, + device_map=kwargs.get("device_map", None), + max_memory_per_gpu=max_memory_per_gpu, + max_cpu_memory=max_cpu_memory, + offload_folder=offload_folder, + gpus=gpus, + ) + ) + + if not autogptq: + if model_kwargs.get("load_in_4bit", None): + assert ( + transformers.__version__ >= "4.30.0" + ), "load_in_4bit requires transformers >= 4.30.0" + if transformers.__version__ >= "4.30.0": + if model_kwargs.get("load_in_4bit", None): + if model_kwargs.get("bnb_4bit_compute_dtype", None): + model_kwargs["bnb_4bit_compute_dtype"] = get_dtype( + model_kwargs["bnb_4bit_compute_dtype"] + ) + + self._model = self.AUTO_MODEL_CLASS.from_pretrained( + pretrained, + revision=revision, + torch_dtype=get_dtype(dtype), + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + else: + try: + from auto_gptq import AutoGPTQForCausalLM + except ModuleNotFoundError: + raise Exception( + "Tried to load auto_gptq, but auto-gptq is not installed ", + "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]", + ) + + self._model = AutoGPTQForCausalLM.from_quantized( + pretrained, + trust_remote_code=trust_remote_code, + model_basename=None if autogptq is True else Path(autogptq).stem, + use_safetensors=True + if autogptq is True + else autogptq.endswith(".safetensors"), + **model_kwargs, + ) + + if peft and delta: + raise ValueError( + "Cannot use both 'peft' and 'delta' options at the same time." 
+ ) + + if peft: + if model_kwargs.get("load_in_4bit", None): + if version.parse(PEFT_VERSION) < version.parse("0.4.0"): + raise AssertionError("load_in_4bit requires peft >= 0.4.0") + if self._model.config.vocab_size != len(self.tokenizer): + # resize model for LoRAs with added tokens + self._model.resize_token_embeddings(len(self.tokenizer)) + eval_logger.info( + f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..." + ) + self._model = PeftModel.from_pretrained( + self._model, peft, revision=revision + ) + elif delta: + if autogptq: + eval_logger.warning( + "Delta weights might trigger unexpected behavior when used with AutoGPTQ." + ) + _model_delta = self.AUTO_MODEL_CLASS.from_pretrained( + delta, + revision=revision, + torch_dtype=get_dtype(dtype), + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + for name, param in self._model.state_dict().items(): + try: + param.data += _model_delta.state_dict()[name] + except KeyError: + raise KeyError(f"Delta model is missing weights for layer: {name}") + except Exception as e: + raise RuntimeError( + f"Failed to add delta weights to layer {name}. Error: {e}" + ) + + del _model_delta + + return None + + def _create_tokenizer( + self, + pretrained: Union[str, transformers.PreTrainedModel], + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ], + revision: Optional[str] = "main", + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + ) -> None: + """ + Helper method during initialization. + + Create a tokenizer object corresponding to the correct + tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed. 
+ """ + + if tokenizer: + if isinstance(tokenizer, str): + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + else: + assert isinstance( + tokenizer, transformers.PreTrainedTokenizer + ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + self.tokenizer = tokenizer + else: + # Get tokenizer based on 'pretrained' + if isinstance(pretrained, str): + model_name = pretrained + else: + # get the HF hub name via accessor on model + model_name = self.model.name_or_path + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_name, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + return None + + def _detect_batch_size(self, requests=None, pos: int = 0): + if requests: + _, context_enc, continuation_enc = requests[pos] + max_length = len( + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + ) + max_context_enc = len(context_enc[-(self.max_length + 1) :]) + max_cont_enc = len(continuation_enc[-(self.max_length + 1) :]) + else: + max_length = self.max_length + max_context_enc = max_length + max_cont_enc = max_length + + # if OOM, then halves batch_size and tries again + @find_executable_batch_size(starting_batch_size=self.max_batch_size) + def forward_batch(batch_size): + if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + length = max(max_context_enc, max_cont_enc) + batched_conts = torch.ones( + (batch_size, length), device=self.device + ).long() + test_batch = torch.ones((batch_size, length), device=self.device).long() + call_kwargs = { + "attn_mask": test_batch, + "labels": batched_conts, + } + else: + call_kwargs = {} + test_batch = torch.ones( + (batch_size, max_length), device=self.device + ).long() + for _ in range(5): + out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) # noqa: F841 + + return batch_size + + try: + batch_size = forward_batch() + except RuntimeError as e: + if "No executable batch size found" in str(e): + batch_size = 1 + else: + raise + + if self.world_size > 1: + # if multi-GPU, always take minimum over all selected batch sizes + max_rnk_bs = torch.tensor([batch_size], device=self.device) + gathered = ( + self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist() + ) + batch_size = min(gathered) + clear_torch_cache() + return batch_size + + clear_torch_cache() + return batch_size + + def tok_encode( + self, string: str, left_truncate_len=None, add_special_tokens=None + ) -> List[int]: + """ """ + # default for None - empty dict, use predefined tokenizer param + # used for all models except for CausalLM or predefined value + special_tokens_kwargs = {} + + # by default for CausalLM - false or self.add_bos_token is set + if add_special_tokens is None: + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + special_tokens_kwargs = { + "add_special_tokens": False or self.add_bos_token + } + # otherwise the method explicitly defines the value + else: + special_tokens_kwargs = {"add_special_tokens": add_special_tokens} + + encoding = self.tokenizer.encode(string, **special_tokens_kwargs) + + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + + return encoding + + def tok_batch_encode( + self, + strings: List[str], + padding_side: str = "left", + left_truncate_len: int = None, + truncation: bool = False, + ) -> 
Tuple[torch.Tensor, torch.Tensor]: + # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. + old_padding_side = self.tokenizer.padding_side + self.tokenizer.padding_side = padding_side + + add_special_tokens = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + + encoding = self.tokenizer( + strings, + truncation=truncation, + padding="longest", + return_tensors="pt", + **add_special_tokens, + ) + if left_truncate_len: + encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] + encoding["attention_mask"] = encoding["attention_mask"][ + :, -left_truncate_len: + ] + self.tokenizer.padding_side = old_padding_side + + return encoding["input_ids"], encoding["attention_mask"] + + def tok_decode(self, tokens, skip_special_tokens=True): + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) + + def _model_call(self, inps, attn_mask=None, labels=None): + """ + :param inps: torch.Tensor + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape + [batch, sequence_ctx]. the size of sequence may vary from call to call + :param attn_mask: torch.Tensor, optional + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed + (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM + :param labels: torch.Tensor, optional + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed + (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM + :return + A torch tensor of shape [batch, sequence, vocab] with the + logits returned from the model's decoder + """ + with torch.no_grad(): + if attn_mask is not None or labels is not None: + assert attn_mask is not None and labels is not None + assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM + return self.model( + input_ids=inps, attention_mask=attn_mask, labels=labels + ).logits + else: + assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + return self.model(inps).logits + + def _model_generate(self, context, max_length, stop, **generation_kwargs): + # temperature = 0.0 if not set + # if do_sample is false and temp==0.0: + # remove temperature, as do_sample=False takes care of this + # and we don't want a warning from HF + generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) + do_sample = generation_kwargs.get("do_sample", None) + + # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies + if generation_kwargs.get("temperature") == 0.0 and do_sample is None: + generation_kwargs["do_sample"] = do_sample = False + + if do_sample is False and generation_kwargs.get("temperature") == 0.0: + generation_kwargs.pop("temperature") + # build stopping criteria + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, context.shape[1], context.shape[0] + ) + return self.model.generate( + input_ids=context, + max_length=max_length, + stopping_criteria=stopping_criteria, + pad_token_id=self.tokenizer.pad_token_id, + use_cache=True, + **generation_kwargs, + ) + + def _select_cont_toks( + self, logits: torch.Tensor, contlen: int = None, inplen: int = None + ) -> torch.Tensor: + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + assert ( + contlen and inplen + ), "Must pass input len and cont. len to select scored logits for causal LM" + # discard right-padding. 
+ # also discard the input/context tokens. we'll only score continuations. + logits = logits[inplen - contlen : inplen] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + assert ( + contlen and not inplen + ), "Selecting scored logits for Seq2SeqLM requires only cont. len" + # only discard right-padding. + # the logits input to this fn only contain decoder-side tokens. + logits = logits[:contlen] + + return logits + + def loglikelihood_rolling( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[float]: + loglikelihoods = [] + + adaptive_batch_size = None + if self.batch_size == "auto": + # using rolling window with maximum context + print("Passed argument batch_size = auto. Detecting largest batch size") + batch_size = self._detect_batch_size() + print(f"Determined Largest batch size: {batch_size}") + adaptive_batch_size = batch_size + + for (string,) in tqdm( + [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0)) + ): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.prefix_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + + # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + pad_amnt = 0 + if self.world_size > 1: + # We pad out the external document-level iterator so the inner iterator doesn't hang + mytensor = torch.tensor(len(rolling_token_windows), device=self.device) + gathered = ( + self.accelerator.gather(mytensor).cpu().detach().numpy().tolist() + ) + + pad_amnt = max(gathered) - gathered[self.rank] + if pad_amnt > 0: + rolling_token_windows += pad_amnt * [rolling_token_windows[0]] + + string_nll = self._loglikelihood_tokens( + requests=rolling_token_windows, + disable_tqdm=True, + override_bs=adaptive_batch_size, + ) + + if (self.world_size > 1) and (pad_amnt > 0): + string_nll = [x[0] for x in string_nll[:-pad_amnt]] + else: + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + + return loglikelihoods + + def _batch_scheduler(self, pos, n_reordered_requests): + sched = pos // int(len(n_reordered_requests) / self.batch_schedule) + if sched in self.batch_sizes: + return self.batch_sizes[sched] + if (len(self.batch_sizes) > 1) and ( + self.batch_sizes[sched - 1] == self.max_batch_size + ): + # if previous batch size is already maximal, skip recomputation + self.batch_sizes[sched] = self.max_batch_size + return self.batch_sizes[sched] + print( + f"Passed argument batch_size = auto:{self.batch_schedule}. 
Detecting largest batch size" + ) + self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos) + print(f"Determined largest batch size: {self.batch_sizes[sched]}") + return self.batch_sizes[sched] + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context + res = [] + + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. + # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. + return req[-2] + req[-1][:-1] + + re_ord = Collator( + requests, + sort_fn=_collate, + group_by="contexts" + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + and self.logits_cache + else None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else override_bs + if override_bs is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" + and n_reordered_requests > 0 + and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + cont_toks_list = [] + inplens = [] + + conts = [] + encoder_attns = [] + + padding_len_inp = None + padding_len_cont = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, context_enc, continuation_enc in chunk: + # sanity check + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + 
inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1], + dtype=torch.long, + device=self.device, + ) + (inplen,) = inp.shape + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + inp = torch.tensor( + (context_enc)[-self.max_length :], + dtype=torch.long, + device=self.device, + ) + (inplen,) = inp.shape + + # build encoder attn masks + encoder_attns.append(torch.ones_like(inp)) + + cont = torch.tensor( + (continuation_enc)[-self.max_length :], + # TODO: left-shift these? + # TODO: our code assumes we never end up truncating conts for either model type + dtype=torch.long, + device=self.device, + ) + (contlen,) = cont.shape + + conts.append(cont) + + padding_len_cont = ( + max(padding_len_cont, contlen) + if padding_len_cont is not None + else contlen + ) + + padding_len_inp = ( + max(padding_len_inp, inplen) + if padding_len_inp is not None + else inplen + ) + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + batched_inps = pad_and_concat( + padding_len_inp, inps, padding_side="right" + ) # [batch, padding_len_inp] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # TODO: left-pad encoder inps and mask? + batched_inps = pad_and_concat( + padding_len_inp, inps + ) # [batch, padding_len_inp] + batched_conts = pad_and_concat( + padding_len_cont, conts + ) # [batch, padding_len_cont] + batched_encoder_mask = pad_and_concat( + padding_len_inp, encoder_attns + ) # [batch, padding_len_inp] + call_kwargs = { + "attn_mask": batched_encoder_mask, + "labels": batched_conts, + } + + multi_logits = F.log_softmax( + self._model_call(batched_inps, **call_kwargs), dim=-1 + ) # [batch, padding_length (inp or cont), vocab] + + for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip( + chunk, multi_logits, inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (logits.shape[0] - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len) + logits = logits.unsqueeze(0) # [1, seq, vocab] + + # Check if per-token argmax is exactly equal to continuation + greedy_tokens = logits.argmax(dim=-1) + + # check for one-token continuation cache hits. + # noop in case group_by != "contexts" or no cache hit and returns the + # original args. Otherwise, expands the logits batch dimension and yields each + # batch along with matching continuation tokens and prompt strings. 
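+                # (illustrative) for a 4-way multiple-choice task whose answer options are
+                # single tokens, all four requests share the same context, so one forward
+                # pass / one cached logits row is enough to score every option.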
+ # logits -> [1, seq, vocab] + for request_str, cont_toks, logits in re_ord.get_cache( + req_str=request_str, + cxt_toks=ctx_tokens, + cont_toks=cont_toks, + logits=logits, + ): + cont_toks = torch.tensor( + cont_toks, dtype=torch.long, device=self.device + ).unsqueeze(0) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + + # Obtain log-probs at the corresponding continuation token indices + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) # [1, seq] + + # Answer: (log prob, is-exact-match) + answer = (float(logits.sum()), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial("loglikelihood", request_str, answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + res = [] + + def _collate(req: Tuple[str, dict]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(req[0]) + return -len(toks), req[0] + + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running generate_until requests", + ) + adaptive_batch_size = None + if self.batch_size == "auto": + # using rolling window with maximum context + print("Passed argument batch_size = auto. Detecting largest batch size") + batch_size = self._detect_batch_size() + print(f"Determined Largest batch size: {batch_size}") + adaptive_batch_size = batch_size + # for each different set of kwargs, we execute all requests, by batch. + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else adaptive_batch_size + if adaptive_batch_size is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" and not adaptive_batch_size + else None + ) + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + # group_fn=lambda x: x[1] -> x=(context, gen_kwargs) + re_ords = Collator( + [reg.args for reg in requests], + sort_fn=_collate, + group_by="gen_kwargs", + group_fn=lambda x: x[1], + ) + chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. 
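+            # (illustrative) a typical gen_kwargs dict for one task might look like
+            #   {"until": ["\n\n"], "max_gen_toks": 128, "do_sample": False};
+            # "until" and "max_gen_toks" are consumed here, anything left over is
+            # forwarded to self._model_generate() below.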
+ until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" + ) + # add EOS token to stop sequences + eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False) + if not until: + until = [eos] + else: + until.append(eos) + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + # set the max length in tokens of inputs ("context_enc") + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + # max len for inputs = max length, minus room to generate the max new tokens + max_ctx_len = self.max_length - max_gen_toks + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # max len for inputs = encoder's whole max_length + max_ctx_len = self.max_length + + # encode, pad, and truncate contexts for this batch + context_enc, attn_masks = self.tok_batch_encode( + contexts, + left_truncate_len=max_ctx_len, + truncation=self.truncation, + ) + context_enc = context_enc.to(self.device) + attn_masks = attn_masks.to(self.device) + + if "max_length" not in kwargs: + kwargs["max_length"] = context_enc.shape[1] + max_gen_toks + + # perform batched generation + cont = self._model_generate( + context=context_enc, + attention_mask=attn_masks, + stop=until, + **kwargs, + ) + + cont_toks_list = cont.tolist() + for cont_toks, context in zip(cont_toks_list, contexts): + # discard context + left-padding toks if using causal decoder-only LM + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + cont_toks = cont_toks[context_enc.shape[1] :] + + s = self.tok_decode(cont_toks) + + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore '' separator, + # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + s = s.split(term)[0] + + res.append(s) + + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + + return res + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + """ + Method to apply a chat template to a list of chat history between user and model. + """ + return self.tokenizer.apply_chat_template( + chat_history, tokenize=False, add_generation_prompt=True + ) + + def get_model_info(self) -> dict: + """ + Method to get Hugging Face model information for experiment reproducibility. + """ + + def get_model_num_params(model) -> int: + if hasattr(model, "num_parameters"): + return model.num_parameters() + if hasattr(model, "parameters"): + return sum(p.numel() for p in model.parameters()) + else: + return -1 + + def get_model_dtype(model) -> str: + if hasattr(model, "dtype"): + return model.dtype + else: + return "" + + def get_model_sha(pretrained: str, revision: str) -> str: + try: + model_info = HfApi().model_info(repo_id=pretrained, revision=revision) + return model_info.sha + except Exception as e: + eval_logger.warn( + f"Failed to get model SHA for {pretrained} at revision {revision}. 
Error: {e}" + ) + return "" + + model_info = { + "model_num_parameters": get_model_num_params(self._model), + "model_dtype": get_model_dtype(self._model), + "model_revision": self.revision, + "model_sha": get_model_sha(self.pretrained, self.revision), + } + if self.peft: + model_info["peft_sha"] = get_model_sha(self.peft, self.revision) + if self.delta: + model_info["delta_sha"] = get_model_sha(self.delta, self.revision) + return model_info diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..cb1aca1eec18a05725ffb29e15f633078cab699b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py @@ -0,0 +1,537 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import pathlib +from copy import deepcopy +from typing import List, Literal + +import filelock +import numpy as np +import torch +from tqdm import tqdm + +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.utils import Collator +from lm_eval.utils import ( + eval_logger, + get_rolling_token_windows, + make_disjoint_window, + simple_parse_args_string, +) + + +def _patch_pretrained_cfg( + pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size +): + try: + import omegaconf + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + + omegaconf.OmegaConf.set_struct(pretrained_cfg, True) + with omegaconf.open_dict(pretrained_cfg): + attributes_to_update = { + "sequence_parallel": False, + "activations_checkpoint_granularity": None, + "activations_checkpoint_method": None, + "precision": trainer.precision, + "global_batch_size": None, + "tensor_model_parallel_size": tensor_model_parallel_size, + "pipeline_model_parallel_size": pipeline_model_parallel_size, + "apply_rope_fusion": False, + } + for name, value in attributes_to_update.items(): + if hasattr(pretrained_cfg, name): + pretrained_cfg[name] = value + return pretrained_cfg + + +def _get_target_from_class(target_class) -> str: + return f"{target_class.__module__}.{target_class.__name__}" + + +def load_model( + model_path: str, + trainer, + tensor_model_parallel_size: int, + pipeline_model_parallel_size: int, +) -> torch.nn.Module: + try: + from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + MegatronGPTModel, + ) + from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo 
following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + model_path = pathlib.Path(model_path) + + save_restore_connector = NLPSaveRestoreConnector() + if model_path.is_dir(): + save_restore_connector.model_extracted_dir = model_path.as_posix() + pretrained_cfg = save_restore_connector.restore_from( + None, model_path.as_posix(), return_config=True, trainer=trainer + ) + if not hasattr(pretrained_cfg, "target"): + pretrained_cfg["target"] = _get_target_from_class(MegatronGPTModel) + + pretrained_cfg = _patch_pretrained_cfg( + pretrained_cfg, + trainer, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + ) + + model_to_load_path = model_path + override_config = pretrained_cfg + + module_name, class_name = override_config.target.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name), class_name) + + # monkeypatch _build_tokenizer method to be process-safe + tokenizer_lock = filelock.FileLock(f"/tmp/{model_path.name}.tokenizer.lock") + + def _synced_build_tokenizer(self): + with tokenizer_lock: + self._original_build_tokenizer() + + model_class._original_build_tokenizer = model_class._build_tokenizer + model_class._build_tokenizer = _synced_build_tokenizer + + model = model_class.restore_from( + restore_path=model_to_load_path.as_posix(), + trainer=trainer, + override_config_path=override_config, + save_restore_connector=save_restore_connector, + map_location=f"cuda:{trainer.local_rank}", + ) + + model.freeze() + model.training = False + try: + # Have to turn off activations_checkpoint_method for inference + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + return model + + +def setup_distributed_environment(trainer): + try: + from nemo.utils.app_state import AppState + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + + def dummy(): + return + + if trainer.strategy.launcher is not None: + trainer.strategy.launcher.launch(dummy, trainer=trainer) + trainer.strategy.setup_environment() + + app_state = AppState() + + return app_state + + +@register_model("nemo_lm") +class NeMoLM(LM): + def __init__( + self, + path: str, + max_length: int = 4096, + batch_size: int = 1, + max_gen_toks: int = 256, + devices: int = 1, + num_nodes: int = 1, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + precision: Literal[ + "16-mixed", + "bf16-mixed", + "32-true", + "64-true", + 64, + 32, + 16, + "64", + "32", + "16", + "bf16", + ] = "bf16", + **kwargs, + ): + try: + from nemo.collections.nlp.modules.common.text_generation_utils import ( + generate, + ) + from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + from pytorch_lightning.trainer.trainer import Trainer + + self.generate = generate + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + + super().__init__() + + if ( + tensor_model_parallel_size == 1 
+ and pipeline_model_parallel_size == 1 + and devices > 1 + ): + eval_logger.info( + f"The number of data replicas for evaluation is {devices}." + ) + eval_logger.info(f"The total number of devices is {devices}.") + eval_logger.info( + "No tensor parallelism or pipeline parallelism is applied." + ) + + elif tensor_model_parallel_size * pipeline_model_parallel_size == devices: + eval_logger.info( + f"Setting tensor parallelism to {tensor_model_parallel_size} and pipeline parallelism to {pipeline_model_parallel_size}." + ) + eval_logger.info(f"The total number of devices is {devices}.") + eval_logger.info("No data parallelism is applied.") + + else: + raise ValueError( + "Please set the product of tensor_model_parallel_size and pipeline_model_parallel_size" + "equal to the specified number of devices." + ) + + if num_nodes > 1: + raise ValueError( + "A number of nodes greater than 1 is not supported yet. Please set num_nodes as 1." + ) + + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=devices, + accelerator="gpu", + num_nodes=num_nodes, + precision=precision, + logger=False, + enable_checkpointing=False, + use_distributed_sampler=False, + ) + # Modify the following flags only for data replication + if ( + tensor_model_parallel_size == 1 + and pipeline_model_parallel_size == 1 + and devices > 1 + ): + self._device = torch.device(f"cuda:{trainer.global_rank}") + self._rank = trainer.global_rank + self._world_size = trainer.world_size + self.model = load_model( + path, + trainer, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + ).cuda() + self.tokenizer = self.model.tokenizer + self.app_state = setup_distributed_environment(trainer) + + self._max_length = max_length + self._batch_size = int(batch_size) + self._max_gen_toks = max_gen_toks + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config=None): + args = simple_parse_args_string(arg_string) + if additional_config: + args["batch_size"] = additional_config.get("batch_size", 1) + + return cls(**args) + + @property + def eot_token_id(self): + try: + return self.tokenizer.eos_id + except AttributeError: + return None + + @property + def max_length(self): + return self._max_length + + @property + def max_gen_toks(self): + return self._max_gen_toks + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + @property + def accelerator(self): + return self._Accelerator(self.world_size) + + class _Accelerator: + def __init__(self, world_size): + self.world_size = world_size + + def wait_for_everyone(self): + torch.distributed.barrier() + + def gather(self, local_tensor): + gathered_tensors = [ + torch.zeros(1, dtype=local_tensor.dtype).cuda() + for _ in range(self.world_size) + ] + torch.distributed.all_gather(gathered_tensors, local_tensor) + return torch.cat(gathered_tensors) + + def tok_encode(self, string: str): + return self.tokenizer.text_to_ids(string) + + def tok_decode(self, tokens): + return self.tokenizer.ids_to_text(tokens) + + def _encode_pair(self, context, continuation): + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + 
continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc + + def loglikelihood(self, requests): + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + # end of text as context + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), + ) + else: + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def loglikelihood_rolling( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[float]: + loglikelihoods = [] + + for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm): + rolling_token_windows = list( + map( + make_disjoint_window, + get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.eot_token_id, + max_seq_len=self.max_length - 1, + context_len=1, + ), + ) + ) + + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + string_nll = self._loglikelihood_tokens( + rolling_token_windows, + ) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + return loglikelihoods + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + res = [] + + def _collate(x): + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = Collator(requests, sort_fn=_collate) + chunks = re_ord.get_batched(n=self.batch_size, batch_fn=None) + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + ctxlens = [] + contlens = [] + + for _, context_enc, continuation_enc in chunk: + # Leave one token for generation. Tokens_to_generate = 0 breaks NeMo. + inp = (context_enc + continuation_enc)[-(self.max_length - 1) :] + + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length - 1) + ) + ctxlens.append(ctxlen) + contlens.append(len(continuation_enc)) + + inps.append(self.tok_decode(inp)) + + output = self.generate( + self.model, + inputs=inps, + tokens_to_generate=1, + min_tokens_to_generate=1, + compute_logprob=True, + all_probs=True, + ) + + batch_token_ids = np.asarray(output["token_ids"])[:, :-1] + batch_logprobs = output["logprob"][:, :-1] + batch_full_logprob = output["full_logprob"][:, :-1, :] + + # Compute greedy tokens for entire batch rather than calling it with proper ctxlen for each sample. + # Additional tokens for each sample will be trimmed later. + min_ctxlen = min(ctxlens) + + # Use min_ctxlen-1 instead of min_ctxlen since full_logprobs are not returns for the first token. + batch_greedy_tokens = ( + torch.argmax(batch_full_logprob[:, min_ctxlen - 1 :, :], -1) + .cpu() + .numpy() + ) + + for token_ids, greedy_tokens, logprobs, ctxlen, contlen, ( + cache_key, + _, + _, + ) in zip( + batch_token_ids, + batch_greedy_tokens, + batch_logprobs, + ctxlens, + contlens, + chunk, + ): + # Trim at contlen since shorter contexts in a batch will have more than one token generated. 
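+                # (illustration) with ctxlen=3 and contlen=2 the slice below is
+                # logprobs[2:][:2], i.e. exactly the two continuation positions.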
+ # Use ctxlen-1 instead of ctxlen same as for full_logprob in batch_greedy_tokens calculation + logprobs = (logprobs[ctxlen - 1 :])[:contlen] + logprob = sum(logprobs).tolist() + + continuation_tokens = (token_ids[ctxlen:])[:contlen] + len_diff = ctxlen - min_ctxlen + is_greedy = continuation_tokens == (greedy_tokens[len_diff:])[:contlen] + if not isinstance(is_greedy, bool): + is_greedy = is_greedy.all() + answer = (logprob, is_greedy) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + res.append(answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def generate_until(self, requests): + if not requests: + return [] + res = [] + + def get_until(req_args): + until = req_args.get("until", []) + until = deepcopy(until) # prevent from modifying req_args for cache_key + if self.tokenizer.ids_to_tokens([self.eot_token_id])[0] not in until: + until.append(self.tokenizer.ids_to_tokens([self.eot_token_id])[0]) + return until + + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ords = Collator( + [reg.args for reg in requests], sort_fn=_collate, group_by="gen_kwargs" + ) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + req_args = all_gen_kwargs[0] + # unpack our keyword arguments. + until = get_until(req_args) + max_gen_toks = req_args.get("max_gen_toks", self.max_gen_toks) + + remaining_length = self.max_length - max_gen_toks + contexts = [] + for context, _ in chunk: + encoded_context = self.tok_encode(context) + encoded_context = encoded_context[-remaining_length:] + contexts.append(self.tok_decode(encoded_context)) + + output = self.generate( + self.model, + inputs=contexts, + tokens_to_generate=max_gen_toks, + end_strings=until, + greedy=True, + ) + + answers = output["sentences"] + + continuations = [] + for context, answer in zip(contexts, answers): + continuations.append(answer[len(context) :]) + + for term in until: + continuations = [answer.split(term)[0] for answer in continuations] + + for request, answer in zip(chunk, continuations): + self.cache_hook.add_partial("greedy_until", request, answer) + res.append(answer) + + return re_ords.get_original(res) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/neuralmagic.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/neuralmagic.py new file mode 100644 index 0000000000000000000000000000000000000000..7c16b06d50b2b8117cf0b6d6b33d9d4a2b681923 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/neuralmagic.py @@ -0,0 +1,426 @@ +import copy +from typing import List, Optional, Tuple, Union + +import numpy +import transformers +from tqdm import tqdm + +import lm_eval.models.utils +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM + + +eval_logger = utils.eval_logger + + +@register_model("sparseml") +class SparseMLLM(HFLM): + """ + SparseML is an open-source model optimization toolkit that enables you to create + inference-optimized sparse models using pruning, quantization, and distillation + algorithms. Models optimized with SparseML can then be exported to the ONNX format and + deployed with DeepSparse for GPU-class performance on CPU hardware. 
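+
+    A minimal invocation might look like the following (the model identifier is a
+    placeholder; any SparseML-compatible checkpoint should work):
+        lm_eval --model sparseml --model_args pretrained=<sparse-model-or-path> --tasks lambada_openai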
+ + This class is a wrapper around the HuggingFace LM class to enable SparseML + integration with the lm-evaluation-harness. + """ + + def _create_model( + self, + pretrained: str, + revision: Optional[str] = "main", + dtype: Optional[str] = "auto", + trust_remote_code: Optional[bool] = False, + **kwargs, + ) -> None: + try: + from sparseml.transformers import SparseAutoModelForCausalLM + except ModuleNotFoundError: + raise Exception( + "Package `sparseml` is not installed. " + "Please install it via `pip install sparseml[transformers]`" + ) + + model_kwargs = kwargs if kwargs else {} + + if "device_map" not in model_kwargs: + # set a device_map to initialize model on the right GPU. + # this is needed because it seems that the default behavior + # for quantized models now seems to be device_map="auto" + # which breaks data-parallel mode. + if hasattr(self, "accelerator"): + model_kwargs.update( + {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}} + ) + else: + model_kwargs.update({"device_map": {"": str(self.device)}}) + + relevant_kwarg_names = [ + "offload_folder", + "device_map", + ] + relevant_kwargs = { + k: v for k, v in model_kwargs.items() if k in relevant_kwarg_names + } + + # Log the difference between model_kwargs and relevant_kwargs so we can see + # what is being ignored + ignored_kwargs = {} + for k, v in model_kwargs.items(): + if k not in relevant_kwargs.keys(): + ignored_kwargs[k] = v + eval_logger.warning( + f"The sparseml integration is ignoring the following kwargs that are specified: {ignored_kwargs}" + ) + + model = SparseAutoModelForCausalLM.from_pretrained( + pretrained, + revision=revision, + torch_dtype=lm_eval.models.utils.get_dtype(dtype), + trust_remote_code=trust_remote_code, + **relevant_kwargs, + ) + self._model = model + + def _get_config(self, pretrained: str, **kwargs) -> None: + try: + from sparseml.transformers import SparseAutoConfig + except ModuleNotFoundError: + raise Exception( + "Package `sparseml` is not installed. " + "Please install it via `pip install sparseml[transformers]`" + ) + + self._config = SparseAutoConfig.from_pretrained( + pretrained_model_name_or_path=pretrained, **kwargs + ) + + def _create_tokenizer( + self, + pretrained: Union[str, transformers.PreTrainedModel], + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ], + **kwargs, + ) -> None: + try: + from sparseml.transformers import SparseAutoTokenizer + except ModuleNotFoundError: + raise Exception( + "Package `sparseml` is not installed. " + "Please install it via `pip install sparseml[transformers]`" + ) + + if tokenizer: + if isinstance(tokenizer, str): + self.tokenizer = SparseAutoTokenizer.from_pretrained( + tokenizer, + **kwargs, + ) + else: + assert isinstance( + tokenizer, transformers.PreTrainedTokenizer + ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + self.tokenizer = tokenizer + else: + # Get tokenizer based on 'pretrained' + if isinstance(pretrained, str): + model_name = pretrained + else: + # get the HF hub name via accessor on model + model_name = self.model.name_or_path + self.tokenizer = SparseAutoTokenizer.from_pretrained( + model_name, + **kwargs, + ) + return None + + +@register_model("deepsparse") +class DeepSparseLM(LM): + """ + Wrapper around DeepSparse, a sparsity-aware deep learning + inference runtime for CPUs, to make it compatible with the + lm-evaluation-harness. 
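+
+    A minimal invocation might look like the following (the model path is a placeholder;
+    a SparseZoo stub or a local DeepSparse deployment directory are both expected to work):
+        lm_eval --model deepsparse --model_args pretrained=<sparsezoo-stub-or-deployment-dir> --tasks lambada_openai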
+ """ + + _DEFAULT_MAX_LENGTH = 2048 + + def __init__( + self, + pretrained: str, + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ] = None, + batch_size: Optional[Union[int, str]] = 1, + max_gen_toks: Optional[int] = 256, + max_length: Optional[int] = None, + ): + super().__init__() + + try: + import deepsparse + except ModuleNotFoundError: + raise Exception( + "Package `deepsparse` is not installed. " + "Please install it via `pip install deepsparse[transformers]`" + ) + + if isinstance(batch_size, str) and not batch_size.isdigit(): + eval_logger.warning( + f"batch_size={batch_size} is not valid for deepsparse because it is not an integer. " + "Ignoring and using the default of 1." + ) + batch_size = 1 + + self.batch_size = int(batch_size) + self._max_length = max_length if max_length else self._DEFAULT_MAX_LENGTH + self._max_gen_toks = max_gen_toks + self.batch_sizes = {} + + # Initialize new model and tokenizer instances + self.model = deepsparse.TextGeneration( + model_path=pretrained, + sequence_length=self._max_length, + batch_size=batch_size, + ) + self.tokenizer = tokenizer if tokenizer else self.model.tokenizer + self.config = self.model.config + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string) + + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def prefix_token_id(self): + # it is used as prefix for loglikelihood + if self.tokenizer.bos_token_id is not None: + return self.tokenizer.bos_token_id + return self.tokenizer.eos_token_id + + @property + def max_length(self) -> int: + return self._max_length + + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + raise NotImplementedError( + "Implementing empty context is not supported yet" + ) + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + ) -> List[Tuple[float, bool]]: + """ + The function to compute the loglikelihood of the continuation + tokens given the context tokens. 
+ + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + res = [] + + def _collate(x): + """Defines the key for the sorted method""" + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = utils.Reorderer(requests, _collate) + + for chunk in tqdm( + list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)), + disable=disable_tqdm, + ): + batch_inp = [] + batch_cache_key = [] + batch_continuation_enc = [] + # len(chunk) is the batch_size + for cache_key, context_enc, continuation_enc in chunk: + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501 + + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + + batch_inp.append(self.tokenizer.decode(inp)) + batch_cache_key.append(cache_key) + batch_continuation_enc.append(continuation_enc) + + response = self.model( + prompt=batch_inp, + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, + ) + + for resp, continuation_enc, cache_key in zip( + response.generations, batch_continuation_enc, batch_cache_key + ): + # (seq_len, vocab_size) + multi_scores = resp.score + + from deepsparse.utils.data import numpy_log_softmax + + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + raise NotImplementedError( + "The method not required by any of our current task integrations so far" + ) + + def generate_until(self, requests: List[Instance]) -> List[str]: + """ + The function to generate a certain number of new tokens + given a context. 
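+        Requests with identical generation arguments are batched together, generated by
+        the DeepSparse pipeline with the requested stop sequences, and the outputs are
+        then cut off post-hoc at the first stop sequence.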
+ + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py + """ + if not requests: + return [] + res = [] + requests = [req.args for req in requests] + + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) + + if ret: + yield ret, lastuntil + + pbar = tqdm(total=len(requests)) + for chunk, request_args in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) + ): + inps = [] + + # make a deepcopy since we are changing arguments + request_args = copy.deepcopy(request_args) + + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) + + for context, _ in chunk: + # add context (prompts) to the list + inps.append(context) + + until = request_args.pop("until", ["<|endoftext|>"]) + request_args.pop("do_sample", None) + request_args["temperature"] = request_args.get("temperature", 0) + + # run inference (generate max_gen_toks tokens) + out = self.model( + sequences=inps, + max_new_tokens=self.max_gen_toks - 1, + stop=until, + **request_args, + ) + + for resp, (context, args_) in zip(out.generations, chunk): + text = resp.text + until_ = until + # split the text at the first occurrence of any of the until tokens + for term in until_: + if len(term) > 0: + text = text.split(term)[0] + + res.append(text) + + self.cache_hook.add_partial( + "generate_until", (context, {"until": until_}), text + ) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/openai_completions.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/openai_completions.py new file mode 100644 index 0000000000000000000000000000000000000000..26dc93d68f469d69e9d165b6a3a0ba87a3055780 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/openai_completions.py @@ -0,0 +1,222 @@ +import os +from functools import cached_property +from typing import Any, Dict, List, Optional, Tuple, Union + +from lm_eval.api.registry import register_model +from lm_eval.models.api_models import TemplateAPI +from lm_eval.utils import eval_logger + + +@register_model("local-completions") +class LocalCompletionsAPI(TemplateAPI): + def __init__( + self, + base_url=None, + tokenizer_backend="huggingface", + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + + def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + generate=False, + gen_kwargs: Optional[dict] = None, + seed: int = 1234, + **kwargs, + ) -> dict: + if generate: + gen_kwargs.pop("do_sample", False) + max_tokens = 
gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = gen_kwargs.pop("until", ["<|endoftext|>"]) + return { + "prompt": messages, + "model": self.model, + "max_tokens": max_tokens, + "temperature": temperature, + "stop": stop, + "seed": seed, + **gen_kwargs, + } + else: + return { + "model": self.model, + "prompt": messages, + "temperature": 0, + "max_tokens": 1, + "logprobs": 1, + "seed": seed, + "echo": True, + } + + @staticmethod + def parse_logprobs( + outputs: Union[Dict, List[Dict]], + tokens: List[List[int]] = None, + ctxlens: List[int] = None, + **kwargs, + ) -> List[Tuple[float, bool]]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choice, ctxlen in zip(out["choices"], ctxlens): + assert ctxlen > 0, "Context length must be greater than 0" + logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1]) + tokens = choice["logprobs"]["token_logprobs"][ctxlen:-1] + top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1] + is_greedy = True + for tok, top in zip(tokens, top_logprobs): + if tok != max(top, key=top.get): + is_greedy = False + break + res.append((logprobs, is_greedy)) + return res + + @staticmethod + def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choices in out["choices"]: + res.append(choices["text"]) + return res + + @property + def api_key(self): + return os.environ.get("OPENAI_API_KEY", "") + + +@register_model("local-chat-completions") +class LocalChatCompletion(LocalCompletionsAPI): + def __init__( + self, + base_url=None, + tokenizer_backend=None, + tokenized_requests=False, + **kwargs, + ): + eval_logger.warning( + "chat-completions endpoint requires the `--apply_chat_template` flag." + ) + super().__init__( + base_url=base_url, + tokenizer_backend=tokenizer_backend, + tokenized_requests=tokenized_requests, + **kwargs, + ) + if self._batch_size > 1: + eval_logger.warning( + "Chat completions does not support batching. Defaulting to batch size 1." + ) + self._batch_size = 1 + + def _create_payload( + self, + messages: List[Dict], + generate=False, + gen_kwargs: dict = None, + seed=1234, + **kwargs, + ) -> dict: + gen_kwargs.pop("do_sample", False) + max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = gen_kwargs.pop("until", ["<|endoftext|>"]) + if not isinstance(stop, (list, tuple)): + stop = [stop] + return { + "messages": messages, + "model": self.model, + "max_tokens": max_tokens, + "temperature": temperature, + "stop": stop[:4], + "seed": seed, + **gen_kwargs, + } + + @staticmethod + def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choices in out["choices"]: + res.append(choices["message"]["content"]) + return res + + def tok_encode( + self, + string: Union[str, Any], + left_truncate_len=None, + add_special_tokens=None, + **kwargs, + ) -> Union[List[str], List[int], Any]: + return string + + def loglikelihood(self, requests, **kwargs): + raise NotImplementedError( + "Loglikelihood is not supported for chat completions. Consider using the completions API instead." 
+ ) + + +@register_model( + "openai-completions", +) +class OpenAICompletionsAPI(LocalCompletionsAPI): + def __init__( + self, + base_url="https://api.openai.com/v1/completions", + tokenizer_backend="tiktoken", + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + + @cached_property + def api_key(self): + """Override this property to return the API key for the API request.""" + key = os.environ.get("OPENAI_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the OPENAI_API_KEY environment variable." + ) + return key + + def loglikelihood(self, requests, **kwargs): + assert ( + self.model != "gpt-3.5-turbo" + ), "Loglikelihood is not supported for gpt-3.5-turbo" + return super().loglikelihood(requests, **kwargs) + + +@register_model("openai-chat-completions") +class OpenAIChatCompletion(LocalChatCompletion): + def __init__( + self, + base_url="https://api.openai.com/v1/chat/completions", + tokenizer_backend=None, + tokenized_requests=False, + **kwargs, + ): + super().__init__( + base_url=base_url, + tokenizer_backend=tokenizer_backend, + tokenized_requests=tokenized_requests, + **kwargs, + ) + + @cached_property + def api_key(self): + """Override this property to return the API key for the API request.""" + key = os.environ.get("OPENAI_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the OPENAI_API_KEY environment variable." + ) + return key diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/optimum_lm.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/optimum_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..70d44abdaca859fa79bd1beed789c96ad2c22ca9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/optimum_lm.py @@ -0,0 +1,87 @@ +import json +from importlib.util import find_spec +from pathlib import Path + +from lm_eval import utils +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM + + +eval_logger = utils.eval_logger + + +@register_model("openvino") +class OptimumLM(HFLM): + """ + Optimum Intel provides a simple interface to optimize Transformer models and convert them to \ + OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \ + Intel® architectures using OpenVINO™ runtime. + + To use an OpenVINO config, use `--model_args ov_config` to point to a json file with an OpenVINO config: + `lm_eval --model openvino --model_args pretrained=gpt2,ov_config=config.json --task lambada_openai` + Example json file contents: {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"} + """ + + def __init__( + self, + device="cpu", + **kwargs, + ) -> None: + if "backend" in kwargs: + # optimum currently only supports causal models + assert ( + kwargs["backend"] == "causal" + ), "Currently, only OVModelForCausalLM is supported." + + self.openvino_device = device + + super().__init__( + device=self.openvino_device, + backend=kwargs.pop("backend", "causal"), + **kwargs, + ) + + def _create_model( + self, + pretrained: str, + revision="main", + dtype="auto", + trust_remote_code=False, + **kwargs, + ) -> None: + if not find_spec("optimum"): + raise Exception( + "package `optimum` is not installed. 
Please install it via `pip install optimum[openvino]`" + ) + else: + from optimum.intel.openvino import OVModelForCausalLM + + model_kwargs = kwargs if kwargs else {} + if "ov_config" in model_kwargs: + if not Path(model_kwargs["ov_config"]).exists(): + raise ValueError( + "ov_config should point to a .json file containing an OpenVINO config" + ) + with open(model_kwargs["ov_config"]) as f: + model_kwargs["ov_config"] = json.load(f) + eval_logger.info( + f"Using custom OpenVINO config: {model_kwargs['ov_config']}" + ) + + else: + model_kwargs["ov_config"] = {} + model_kwargs["ov_config"].setdefault("CACHE_DIR", "") + model_file = Path(pretrained) / "openvino_model.xml" + if model_file.exists(): + export = False + else: + export = True + + self._model = OVModelForCausalLM.from_pretrained( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + export=export, + device=self.openvino_device.upper(), + **model_kwargs, + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/textsynth.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/textsynth.py new file mode 100644 index 0000000000000000000000000000000000000000..a14f6287b6f11b21cfc69ca471bcbe99a631be12 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/textsynth.py @@ -0,0 +1,172 @@ +"""TextSynth API +Implementation provided by Fabrice Bellard: + https://github.com/EleutherAI/lm-evaluation-harness/issues/295 + +In order to use the API, you must have a valid TextSynth account and +enough credits. + +Example usage: + + python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa + +Homepage: https://textsynth.com/index.html +""" + +import logging +import os + +import requests as _requests +from tqdm import tqdm + +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.utils import retry_on_specific_exceptions + + +logger = logging.getLogger(__name__) + + +def textsynth_completion(**kwargs): + """Query TextSynth API for completion. + Retry with back-off until they respond. + """ + + def _exception_callback(e: Exception, sleep_time: float) -> None: + import traceback + + traceback.print_exc() + + @retry_on_specific_exceptions( + on_exceptions=[_requests.exceptions.RequestException], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + return _requests.post(**kwargs) + + return completion() + + +@register_model("textsynth") +class TextSynthLM(LM): + def __init__(self, engine, truncate: bool = False, **kwargs) -> None: + """ + :param engine: str + TextSynth API engine (e.g. `gptj_6B`) + :param truncate: bool + Truncate input if too long (if False and input is too long, throw error) + """ + super().__init__() + + self.engine = engine + self.truncate = truncate + self.api_url = "https://api.textsynth.com" + # Read from environment variable TEXTSYNTH_API_SECRET_KEY + self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"] + + @property + def eot_token_id(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + @property + def max_length(self) -> int: + # NOTE: Turn on truncation to avoid errors on long inputs. 
+ return 2048 + + @property + def max_gen_toks(self) -> int: + return 256 + + @property + def batch_size(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + def tok_encode(self, string: str): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + def tok_decode(self, tokens): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + res = [] + for context, continuation in tqdm(requests, disable=disable_tqdm): + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/logprob", + headers={"Authorization": "Bearer " + self.api_key}, + json={"context": context, "continuation": continuation}, + ) + resp = response.json() + if "logprob" in resp: + logprob = resp["logprob"] + is_greedy = resp["is_greedy"] + res.append((logprob, is_greedy)) + + self.cache_hook.add_partial( + "loglikelihood", (context, continuation), (logprob, is_greedy) + ) + else: + logger.error( + f"The following response does not contain `logprobs`. Got:\n{resp}" + ) + assert False + return res + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + # TODO: The TextSynth API does not support tokenized inputs so we cannot + # manually partition long contexts into smaller rolling windows as + # done for other models derived from `BaseLM`. Override this method + # with a windowing scheme that works for direct string inputs. + raise NotImplementedError( + "`loglikelihood_rolling` is currently not supported due to lack of " + "input tokenization support from TextSynth." + ) + + def generate_until(self, requests, disable_tqdm: bool = False): + if not requests: + return [] + + res = [] + for request in tqdm(requests, disable=disable_tqdm): + inp = request[0] + request_args = request[1] + until = request_args["until"] + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/completions", + headers={"Authorization": "Bearer " + self.api_key}, + json={ + "prompt": inp, + "max_tokens": self.max_gen_toks, + "top_k": 1, + "stop": until, + }, + ) + resp = response.json() + if "text" in resp: + s = resp["text"] + res.append(s) + + self.cache_hook.add_partial("generate_until", (inp, request_args), s) + else: + logger.error( + "The following response does not contain generated `text`. 
" + "Got:\n{resp}" + ) + assert False + return res + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8a81e5deca280f4e48b584a4eac78fb44d1feda2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/utils.py @@ -0,0 +1,666 @@ +import collections +import fnmatch +import gc +import itertools +import time +from functools import wraps +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + Union, +) + +import torch +import transformers + +from lm_eval.utils import eval_logger + + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + from transformers.configuration_utils import PretrainedConfig + + +def chunks(iter, n: int = 0, fn=None): + """ + Divides an iterable into chunks of specified size or based on a given function. + Useful for batching + + Parameters: + - iter: The input iterable to be divided into chunks. + - n: An integer representing the size of each chunk. Default is 0. + - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None. + + Returns: + An iterator that yields chunks of the input iterable. + + Example usage: + ``` + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + for chunk in chunks(data, 3): + print(chunk) + ``` + Output: + ``` + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [10] + ``` + """ + arr = [] + for i, x in enumerate(iter): + arr.append(x) + if len(arr) == (fn(i, iter) if fn else n): + yield arr + arr = [] + + if arr: + yield arr + + +class MultiChoice: + def __init__(self, choices) -> None: + self.choices = choices + + # Simple wildcard support (linux filename patterns) + def __contains__(self, values) -> bool: + for value in values.split(","): + if len(fnmatch.filter(self.choices, value)) == 0: + eval_logger.info("Available tasks to choose:") + for choice in self.choices: + eval_logger.info(f" - {choice}") + raise ValueError("'{}' is not in task list".format(value)) + return True + + def __iter__(self) -> Iterator: + for choice in self.choices: + yield choice + + +class Grouper: + """ + takes an array `arr` and function `fn` and returns a dictionary + with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all + objects in `arr` satisfying `key == fn(ob)`. + """ + + def __init__(self, arr, fn) -> None: + # self.orig_arr = arr + self.size = len(arr) + arr = list(enumerate(arr)) + + def group_return_dict(arr, fn): + res = collections.defaultdict(list) + + for ob in arr: + res[fn(ob)].append(ob) + return res + + arr = group_return_dict(arr, lambda x: fn(x[1])) + + # self.arr has format Dict[Tuple[int, ]] + self.arr = arr + self._grouped = None + + def get_grouped(self): + # return the contents but not indices for our grouped dict. + if self._grouped: + return self._grouped + grouped = {} + for key in self.arr.keys(): + # drop the index from each element of self.arr + grouped[key] = [y[1] for y in self.arr[key]] + self._grouped = grouped + return grouped + + def get_original(self, grouped_dict): + # take in a grouped dictionary with e.g. 
results for each key listed + # in the same order as the instances in `self.arr`, and + # return the results in the same (single list) order as `self.orig_arr`. + res = [None] * self.size + cov = [False] * self.size + # orig = [None] * self.size + + assert grouped_dict.keys() == self.arr.keys() + + for key in grouped_dict.keys(): + for (ind, _), v in zip(self.arr[key], grouped_dict[key]): + res[ind] = v + cov[ind] = True + # orig[ind] = _ + + assert all(cov) + # assert orig == self.orig_arr + + return res + + +def pad_and_concat( + max_length: int, + tensors: List[torch.Tensor], + padding_side: Literal["right", "left"] = "right", +): + """ + Method for padding a list of tensors given the maximum tensor + length in the batch. Used for batching inputs and continuations in + seq2seq models. + """ + assert ( + padding_side == "left" or padding_side == "right" + ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'" + + for i, tensor in enumerate(tensors): + if len(tensor.shape) == 2: + tensor = tensor.squeeze(0) # squeeze, in case passed [1, seq] size + tensor_len = tensor.shape[0] + if tensor_len < max_length: + if padding_side == "right": + # right-pad + tensors[i] = torch.cat( + [ + tensor, # [seq] + torch.zeros( + max_length - tensor_len, + dtype=torch.long, + device=tensor.device, + ), # [padding_length - seq] + ], + dim=0, + ).unsqueeze(0) + else: + # left-pad + tensors[i] = torch.cat( + [ + torch.zeros( + max_length - tensor_len, + dtype=torch.long, + device=tensor.device, + ), # [padding_length - seq] + tensor, # [seq] + ], + dim=0, + ).unsqueeze(0) + else: + tensors[i] = tensor.unsqueeze(0) + + return torch.cat(tensors, dim=0) + + +def clear_torch_cache() -> None: + gc.collect() + torch.cuda.empty_cache() + + +def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype: + """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig""" + if isinstance(dtype, str) and dtype != "auto": + # Convert `str` args torch dtype: `float16` -> `torch.float16` + _torch_dtype = getattr(torch, dtype) + else: + _torch_dtype = dtype + return _torch_dtype + + +class MultiTokenEOSCriteria(transformers.StoppingCriteria): + """Criteria to stop on the specified multi-token sequence.""" + + def __init__( + self, + sequence: str, + tokenizer: transformers.PreTrainedTokenizer, + initial_decoder_input_length: int, + batch_size: int, + ) -> None: + self.initial_decoder_input_length = initial_decoder_input_length + self.done_tracker = [False] * batch_size + self.sequence = sequence + self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False) + # print(sequence, self.sequence_ids) + # we look back for 2 more tokens than it takes to encode our stop sequence + # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']` + # and we don't want to mistakenly not stop a generation because our + # (string) stop sequence was output in a different tokenization + + # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model, + # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized + # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described. 
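+        # Illustrative example: if `sequence` is "\n\n" it may encode to a single id,
+        # yet a model can emit two separate "\n" tokens instead; decoding the last
+        # len(self.sequence_ids) + 2 generated tokens back to text and doing a substring
+        # check (see __call__) still catches the stop string in that case.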
+ self.sequence_id_len = len(self.sequence_ids) + 2 + self.tokenizer = tokenizer + + def __call__(self, input_ids, scores, **kwargs) -> bool: + # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :] + + lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :] + + lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) + + for i, done in enumerate(self.done_tracker): + if not done: + self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + return False not in self.done_tracker + + +def stop_sequences_criteria( + tokenizer: transformers.PreTrainedTokenizer, + stop_sequences: List[str], + initial_decoder_input_length: int, + batch_size: int, +) -> transformers.StoppingCriteriaList: + return transformers.StoppingCriteriaList( + [ + *[ + MultiTokenEOSCriteria( + sequence, tokenizer, initial_decoder_input_length, batch_size + ) + for sequence in stop_sequences + ], + ] + ) + + +def undistribute(iterable): + """ + Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute . + + Re-interleaves results that have been split using more_itertools.distribute: + >>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6]) + >>> list(group_1) + [1, 3, 5] + >>> list(group_2) + [2, 4, 6] + >>> undistribute([group_1, group_2]) + [1, 2, 3, 4, 5, 6] + + Handles non-uniform component lengths: + + >>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7]) + >>> [list(c) for c in children] + [[1, 4, 7], [2, 5], [3, 6]] + >>> undistribute(children) + [1, 2, 3, 4, 5, 6, 7] + + Also handles when some iterables are empty: + + >>> children = distribute(5, [1, 2, 3]) + >>> [list(c) for c in children] + [[1], [2], [3], [], []] + >>> undistribute(children) + [1, 2, 3] + + """ + + return [ + x + for x in itertools.chain.from_iterable( + itertools.zip_longest(*[list(x) for x in iterable]) + ) + if x is not None + ] + + +def retry_on_specific_exceptions( + on_exceptions: List[Type[Exception]], + max_retries: Optional[int] = None, + backoff_time: float = 3.0, + backoff_multiplier: float = 1.5, + on_exception_callback: Optional[Callable[[Exception, float], Any]] = None, +): + """Retry on an LLM Provider's rate limit error with exponential backoff + For example, to use for OpenAI, do the following: + ``` + from openai import RateLimitError + + # Recommend specifying max_retries to avoid infinite loops! + @retry_on_specific_exceptions([RateLimitError], max_retries=3) + def completion(...): + # Wrap OpenAI completion function here + ... + ``` + """ + + def decorator(func: Callable): + @wraps(func) + def wrapper(*args, **kwargs): + sleep_time = backoff_time + attempt = 0 + while max_retries is None or attempt < max_retries: + try: + return func(*args, **kwargs) + except tuple(on_exceptions) as e: + if on_exception_callback is not None: + on_exception_callback(e, sleep_time) + time.sleep(sleep_time) + sleep_time *= backoff_multiplier + attempt += 1 + + return wrapper + + return decorator + + +class Collator: + """ + A class for reordering and batching elements of an array. + + This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data. + + Objects of this class have the group_by attribute which determines the method for grouping + the data while batching it. 
Three options include "gen_kwargs", "contexts", or None: + If group_by == "gen_kwargs" then requests will be grouped by gen_kwargs + If group_by == "contexts" then requests will be grouped by context + cont[:-1] + If None then requests will just be reordered by length descending. + """ + + def __init__( + self, + arr: List, + sort_fn: Callable = lambda x: x, + group_fn: Callable = lambda x: x[1], + group_by: Union[Literal["gen_kwargs", "contexts"], None] = None, + ) -> None: + self._group_by = group_by + # 0 indices are enumerated indices. Apply functions to original arr. + self._sort_fn = lambda x: sort_fn(x[1]) + self._group_fn = lambda x: group_fn(x[1]) + self._reorder_indices: List = [] + self._size = len(arr) + self._arr_with_indices: Union[Dict, Tuple[Tuple[int, Any], ...]] = tuple( + enumerate(arr) + ) # [indices, (arr)] + if self._group_by == "contexts": + self._group_by_context() + elif self._group_by == "gen_kwargs": + self._group_by_index() + + def _group_by_index(self) -> None: + """Group the elements of a list based on their indices.""" + self._arr_with_indices = self.group( + self._arr_with_indices, fn=self._group_fn, group_by="gen_kwargs" + ) + + def _group_by_context(self) -> None: + """Group the array with indices by context.""" + self._arr_with_indices = self.group( + self._arr_with_indices, fn=self._group_fn, group_by="contexts" + ) + + def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator: + """ + Generates and yields batches from the reordered array. The method of grouping and batching + depends on the parameter `group_by`. + If `group_by` is set to "gen_kwargs", it will batch the + re-ordered values with same gen_kwargs for each batch. + If `group_by` is "contexts", it caches the requests by context before batching. + If `group_by` is neither "gen_kwargs" nor "contexts", it yields the reordered array + + Parameters: + - n (int): The size of each batch. Defaults to 1. + - batch_fn ([Callable[[int, Iterable], int]] | None): A function to determine the size of + each batch. Optional, defaults to None. + + Returns: + Iterator: An iterator over batches of reordered elements grouped as per the `group_by` + attribute. + + Yields: + List of batched elements according to the `group_by` attribute. + """ + if self._group_by == "gen_kwargs": + for ( + key, + values, + ) in self._arr_with_indices.items(): # type: ignore + values = self._reorder(values) + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + elif self._group_by == "contexts": + # Get one sample from each key + values = self._reorder( + [value[0] for value in self._arr_with_indices.values()] + ) + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + else: + values = self._reorder(self._arr_with_indices) # type: ignore + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + + def get_cache( + self, + req_str: Tuple[str, str] = None, + cxt_toks: List[int] = None, + cont_toks: List[int] = None, + logits: torch.Tensor = None, + ) -> Iterator[Tuple[Tuple[str, str], List[int], torch.Tensor]]: + """ + Retrieves cached single-token continuations and their associated arguments, updating indices as necessary. + + The behavior of this function varies depending on how the `group_by` attribute is set: + + - When `group_by` is "contexts": + The function identifies single-token continuations by checking for keys that equate to + [context+continuation][-1] and logs the indices for re-ordering. 
+ In this mode, this function can work in two scenarios: + + 1. Cache Hit - Single Match: + If a single matching context-continuation pair is found in the cache, + the function yields the original arguments. + + 2. Cache Hit - Multiple Matches: + If multiple matching context-continuation pairs are found in the cache, + the function expands the logits batch dimension to match the number of cache hits. + It updates the original requests and continuation tokens. + + - When `group_by` is not set to "contexts": + This method yields the original arguments, logits and continuation tokens, + without checking for one-token continuations. + + Parameters: + - req_str (tuple[str, str]): Original strings used for CachingLM. + - cxt_toks (list[int]): Full context tokens used for lookup. + - cont_toks (list[int]): Continuation tokens for which logits were generated. + - logits (torch.Tensor [1, seq_length, vocab_size]): Logits generated by the model given context and continuation keys. + + Yields: + - Iterator: + - req_str (tuple[str, str]): strings used for CachingLM. + - cont_toks (list[int]) : continuation tokens. + - logits (torch.Tensor [1, seq_length, vocab_size]): The original logits (repeated cache hit times) + """ + if self._group_by == "contexts": + cache_hit: List[ + Tuple[int, Tuple[Tuple[str, str], List[int], List[int]]] + ] = self._arr_with_indices.pop(tuple(cxt_toks + cont_toks[:-1])) + if (cache_size := len(cache_hit)) == 1: + self._reorder_indices.extend(x[0] for x in cache_hit) + yield req_str, cont_toks, logits + else: + # If we have matching requests then expand the batch dimension (no-op) and + # yield each along with its corresponding args. + multilogits = logits.expand(cache_size, -1, -1).chunk(cache_size) + indices, req_str, cont_toks = zip( + *[(x[0], x[1][0], x[-1][-1]) for x in cache_hit] + ) + self._reorder_indices.extend(indices) + for c_key, cont_tok, logit in zip(req_str, cont_toks, multilogits): + yield c_key, cont_tok, logit + else: + yield req_str, cont_toks, logits + + def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> Iterator: + """ + Reorders the elements in the array based on the sorting function. + + Parameters: + - arr (list | tuple[tuple[int, Any], ...]]): The array or iterable to be reordered. + + Yields: + Iterator + """ + arr = sorted(arr, key=self._sort_fn) + if not self._group_by == "contexts": + # If grouped by contexts then indices will be set in get_cache() + self._reorder_indices.extend([x[0] for x in arr]) + yield from [x[1] for x in arr] + + def get_original(self, newarr: List) -> List: + """ + Restores the original order of elements from the reordered list. + + Parameters: + - newarr (list): The reordered array. + + Returns: + list: The array with elements restored to their original order. + """ + res = [None] * self._size + cov = [False] * self._size + + for ind, v in zip(self._reorder_indices, newarr): + res[ind] = v + cov[ind] = True + + assert all(cov) + + return res + + def __len__(self): + return self._size + + @staticmethod + def group( + arr: Iterable, + fn: Callable, + group_by: Literal["gen_kwargs", "contexts"] = "gen_kwargs", + ) -> dict: + """ + Groups elements of an iterable based on a provided function. + + + The `group_by` parameter determines the method of grouping. + If `group_by` is "contexts", the elements are grouped by [context + cont][:-1]. + If `group_by` is "gen_kwargs", the elements are grouped based on the gen_kwargs dict. + + Parameters: + - arr (Iterable): The iterable to be grouped. 
+ - fn (Callable): The function to determine the grouping. + - values (bool): If True, returns the values of the group. Defaults to False. + + Returns: + Iterator: An iterable of grouped elements. + """ + res = collections.defaultdict(list) + for ob in arr: + # where ob == [context + cont] + if group_by == "contexts": + res[tuple(fn(ob))].append(ob) + else: + try: + hashable_dict = tuple( + ( + key, + tuple(value) + if isinstance(value, collections.abc.Iterable) + else value, + ) + for key, value in sorted(fn(ob).items()) + ) + res[hashable_dict].append(ob) + except (TypeError, AttributeError): + res[tuple(fn(ob))].append(ob) + return res + + @staticmethod + def get_chunks(_iter, n: int = 0, fn=None): + """ + Divides an iterable into chunks of specified size or based on a given function. + Useful for batching + + Parameters: + - iter: The input iterable to be divided into chunks. + - n: An integer representing the size of each chunk. Default is 0. + - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None. + + Returns: + An iterator that yields chunks of the input iterable. + + Example usage: + ``` + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + for chunk in chunks(data, 3): + print(chunk) + ``` + Output: + ``` + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [10] + ``` + """ + arr = [] + _iter = tuple(_iter) + for i, x in enumerate(_iter): + arr.append(x) + if len(arr) == (fn(i, _iter) if fn else n): + yield arr + arr = [] + + if arr: + yield arr + + +def configure_pad_token( + tokenizer: "PreTrainedTokenizerBase", + model_config: Optional["PretrainedConfig"] = None, +) -> "PreTrainedTokenizerBase": + """ + This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present. + Some tokenizers require special handling. + + Args: + tokenizer: The tokenizer for which the padding token is to be handled. + model_config: The configuration of the model. Default is None. + + Returns: + The tokenizer after the padding token has been handled. + + Raises: + AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0. 
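+
+    Example (illustrative; GPT-2's tokenizer ships without a pad token):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer = configure_pad_token(tokenizer)
+        # pad_token_id is now set, falling back to an existing special token id here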
+ """ + if tokenizer.pad_token: + pass + elif tokenizer.unk_token: + tokenizer.pad_token_id = tokenizer.unk_token_id + elif tokenizer.eos_token: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + # handle special cases + if model_config and getattr(model_config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + tokenizer.pad_token = "<|endoftext|>" + elif ( + tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert tokenizer.pad_token_id == 0 + else: + tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + return tokenizer diff --git a/scripts/yans/lm-evaluation-harness/scripts/__init__.py b/scripts/yans/lm-evaluation-harness/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/lm-evaluation-harness/scripts/build_benchmark.py b/scripts/yans/lm-evaluation-harness/scripts/build_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..fc99b5ec37c6979bf55f6a1ac0ea6808fd0e539f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/build_benchmark.py @@ -0,0 +1,61 @@ +import argparse +import os + +import yaml +from promptsource.templates import DatasetTemplates +from tqdm import tqdm + +# from lm_eval.api.registry import ALL_TASKS +from lm_eval.logger import eval_logger + + +# from lm_eval.tasks import include_task_folder + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--benchmark_name", required=True) + parser.add_argument("--benchmark_path", required=True) + parser.add_argument("--task_save_path", default="lm_eval/tasks/") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + with open(args.benchmark_path, encoding="utf-8") as file: + TASK_LIST = yaml.full_load(file) + for task in tqdm(TASK_LIST): + eval_logger.info(f"Processing {task}") + + dataset_name = task["dataset_path"] + if "dataset_name" in task: + subset_name = task["dataset_name"] + file_subdir = f"{dataset_name}/{subset_name}" + else: + subset_name = None + file_subdir = f"{dataset_name}" + + file_path = os.path.join(args.task_save_path, file_subdir, "promptsource/") + + os.makedirs(file_path, exist_ok=True) + + if subset_name is None: + prompts = DatasetTemplates(dataset_name=dataset_name) + else: + prompts = DatasetTemplates( + dataset_name=dataset_name, subset_name=subset_name + ) + + for idx, prompt_name in enumerate(prompts.all_template_names): + full_file_name = f"promptsource_{idx}.yaml" + config_dict = { + "group": args.benchmark_name, + "include": "promptsource_template.yaml", + "use_prompts": f"promptsource:{prompt_name}", + } + + file_save_path = os.path.join(file_path, full_file_name) + eval_logger.info(f"Save to {file_save_path}") + with open(file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump(config_dict, yaml_file) diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/README.md b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/README.md new file mode 
100644 index 0000000000000000000000000000000000000000..7985adecaab926b39e5bfd5b96b093f73450e660 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/README.md @@ -0,0 +1,36 @@ +janitor.py contains a script to remove benchmark data contamination from training data sets. +It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165). + +## Algorithm +1) Collects all contamination text files that are to be removed from training data +2) Filters training data by finding `N`gram matches between the training data + and any contamination + 1) `N`grams ignore case and punctuation and are split on whitespace. + 2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around + the match, splitting the training data into chunks + 3) Any chunks less than `minimum_slice_length` are removed + 4) Training data sets split into more than `too_dirty_cutoff` are considered + completely contaminated and removed + +OpenAI used: +``` +ngram_n = 13 +window_to_remove = 200 +minimum_slice_length = 200 +too_dirty_cutoff = 10 +``` + +## Compiling + +Janitor can be used as a pure python program, but it is much faster if the ngram +code is run in C++. To compile the C++ code, run + +``` +pip install pybind11 +c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) +``` + +MacOS users: If your compiler isn't linked to Python, you may need to add to the above `-undefined dynamic_lookup`. \ +Linux users: If your compiler isn't linked to Python, you may need to follow these steps: +1. Rename the compiled code file to `janitor_util.so`. +2. Before running `import Janitor` in your code, add `sys.path.append("your/relative/path/to/janitor_util.so")` so that Python knows the location of `janitor_util.so`. 
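+
+## Usage sketch
+
+A minimal, illustrative sketch of the Python entry points that `generate_13_grams.py` in this
+directory builds on; the sample text and exact arguments are assumptions, not part of the pipeline:
+
+```
+from lm_eval.decontamination.janitor import Janitor, word_ngrams
+
+janitor = Janitor()
+document = "a stand-in training document with enough words to form at least one thirteen gram here"
+# normalize_string is the normalization step (ngrams ignore case and punctuation, as described above)
+ngrams = list(word_ngrams(janitor.normalize_string(document), 13))
+```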
diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/__init__.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py new file mode 100644 index 0000000000000000000000000000000000000000..d4af5ba5f3d5e16a485984ced2324951e56ad829 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py @@ -0,0 +1,73 @@ +import argparse +import glob +import logging +import os +import shutil +import subprocess + +from tqdm import tqdm +from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm + + +logger = logging.getLogger(__name__) + + +def process_task( + working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm +): + command = f"zstd {bucket_file_path}" + logger.info(command) + subprocess.call(command, shell=True) + + compressed_file = bucket_file_path + ".zst" + if output_directory: + shutil.move(compressed_file, output_directory) + + os.remove(bucket_file_path) + global_tqdm.update() + + +def compress_and_move(working_directory, output_directory, process_count): + os.makedirs(output_directory, exist_ok=True) + original_info_file_path = os.path.join(working_directory, "info.json") + assert os.path.exists(original_info_file_path) + + tasks = [] + bucket_file_paths = glob.glob( + os.path.join(working_directory, "output", "*.bkt.txt.sorted") + ) + for bucket_file_path in bucket_file_paths: + task = (process_task, (working_directory, output_directory, bucket_file_path)) + tasks.append(task) + + pool = TqdmMultiProcessPool(process_count) + + def on_done(_): + return None + + def on_error(_): + return None + + global_progress = tqdm( + total=len(bucket_file_paths), dynamic_ncols=True, unit="file" + ) + _ = pool.map(global_progress, tasks, on_error, on_done) + + shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json")) + + +parser = argparse.ArgumentParser(description="sort 13gram buckets") +parser.add_argument("-dir", "--working_directory", required=True) +parser.add_argument("-output", "--output_directory", required=True) +parser.add_argument("-procs", "--process_count", type=int, default=8) + +if __name__ == "__main__": + version = 1.00 + print(f"Running version {version}") + + logfile_path = "compress_and_package.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + compress_and_move(args.working_directory, args.output_directory, args.process_count) diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py new file mode 100644 index 0000000000000000000000000000000000000000..e508f266e9bfbe1cdf6f93de478c2d60d490d557 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py @@ -0,0 +1,215 @@ +""" +Outputs all 13-grams found in The Pile. + +Loops through all documents and uses the logic found in janitor.py to extract 13-grams. +We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the +next stage. 
We also include the current pile document_id with each ngram instance to allow the +filtering to exclude 13-grams that match more then 10 unique documents (done further down the pipeline). + +We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes +resuming hard (and we had the storage). + +Arguments +--------- +--working_directory (-dir) + Directory containing the pile distribution. An "output" subdirectory will be created underneath + to store the bucketed 13-grams, checkpoint and done files. Default: current directory +--n_value (-n) + n value in n-gram, added for later use if ever needed. Default: 13 +--bucket_count (-buckets) + Number of file buckets to use when generating 13grams. Default: 500 +""" + +import argparse +import glob +import json +import logging +import os +import pickle +import signal +import sys +from pathlib import Path +from signal import SIGINT + +from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm + +from lm_eval.decontamination.archiver import Reader, TextArchive +from lm_eval.decontamination.janitor import Janitor, word_ngrams + + +logger = logging.getLogger(__name__) + +terminate = False + + +def handler(signal_received, frame): + global terminate + terminate = True + + +def yield_pile(start_offsets=None, checkpoint_offset=None): + directory = "pile" + + if not os.path.exists(directory): + print( + "We expect the pile archives to be in the 'pile' directory, but this was not found." + ) + raise Exception("Pile directory not found.") + + files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) + + pile_global_offset = 0 + start_file = 0 + if checkpoint_offset: + for file_i, start_offset in enumerate(start_offsets): + if start_offset > checkpoint_offset: + break + + start_file = file_i + pile_global_offset = start_offset + + for file_i, file in enumerate(files): + if file_i < start_file: + logger.info(f"Skipping file {file}") + continue + logger.info(f"Reading from pile file: {file}") + reader = Reader() + for document in reader.read(file): + yield (pile_global_offset, document) + pile_global_offset += 1 + + +# Hash buckets > disk backed files. Supports file position checkpointing and resuming +# Allows you to write continuously and checkpoint intermittently. If a failure occurs +# the buckets are simply truncated at your last checkpoint. 
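+#
+# Illustrative use (mirroring do_ngrams_in_buckets below):
+#   buckets = Buckets(output_directory, bucket_count)
+#   buckets.add_data(ngram, f"{ngram} {offset}")  # routed to a bucket via hash(ngram)
+#   buckets.save_checkpoint()                     # flush and record bucket file offsets
+#   buckets.close_buckets()                       # commit all bucket files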
+class Buckets: + def __init__(self, directory, num_buckets): + self.bucket_files = [ + os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets) + ] + self.buckets = list(map(TextArchive, self.bucket_files)) + self.checkpoint_file = os.path.join(directory, "bucket_offsets.ckpt") + + if os.path.exists(self.checkpoint_file): + self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb")) + else: + self.bucket_offsets = [0 for i in range(len(self.buckets))] + + for i, offset in enumerate(self.bucket_offsets): + bucket = self.buckets[i] + bucket.fh.seek(offset) + bucket.fh.truncate() + + def add_data(self, key, value): + i = hash(key) % len(self.buckets) + bucket = self.buckets[i] + bucket.add_data(value) + + def save_checkpoint(self): + for bucket in self.buckets: + bucket.fh.flush() + + bucket_offsets = [bucket.fh.tell() for bucket in self.buckets] + pickle.dump(bucket_offsets, open(self.checkpoint_file, "wb")) + + def close_buckets(self): + for bucket in self.buckets: + bucket.commit() + + +def do_ngrams_in_buckets(n_value, working_directory, bucket_count): + pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8")) + pile_document_count = pile_statistics["Document Count"] + start_offsets = pile_statistics["File Start Offsets"] + + output_directory = os.path.join(working_directory, "output") + os.makedirs(output_directory, exist_ok=True) + + logger.info(f"Generating {n_value}-grams and bucketing.") + + # Done file + done_file = os.path.join(output_directory, "ngram_buckets.done") + if os.path.exists(done_file): + logger.info("ngrams already generated and bucketed, skipping") + return + + # Checkpoint + checkpoint_file = os.path.join(working_directory, "pile_offset.ckpt") + if os.path.exists(checkpoint_file): + checkpoint_offset = pickle.load(open(checkpoint_file, "rb")) + iterate = True + else: + checkpoint_offset = 0 + iterate = False + + logger.info(f"Starting at pile document index {checkpoint_offset}") + buckets = Buckets(output_directory, bucket_count) + + janitor = Janitor() + batch_size = 1000 + batch_counter = 0 + + with tqdm(total=checkpoint_offset, dynamic_ncols=True, unit="docs") as progress: + for offset, document in yield_pile(start_offsets, checkpoint_offset): + if iterate: + logger.info(f"Iterating to offset {checkpoint_offset} from {offset}") + progress.update(offset) + iterate = False + + if offset < checkpoint_offset: + progress.update() + + if terminate: + return + continue + + if offset == checkpoint_offset: + progress.reset(total=pile_document_count) + progress.update(checkpoint_offset) + + # Save checkpoint every "batch_size", only allow terminate after checkpoint + if batch_counter == batch_size: + progress.update(batch_size) + batch_counter = 0 + buckets.save_checkpoint() + pickle.dump(offset, open(checkpoint_file, "wb")) + if terminate: + buckets.close_buckets() + return + + ngrams = word_ngrams(janitor.normalize_string(document), n_value) + for ngram in ngrams: + buckets.add_data(ngram, f"{ngram} {offset}") + + batch_counter += 1 + + buckets.close_buckets() + Path(done_file).touch() + + +parser = argparse.ArgumentParser(description="Generate 13 grams from Pile.") +parser.add_argument("-dir", "--working_directory", default="") +parser.add_argument("-n", "--n_value", type=int, default=13) +parser.add_argument("-buckets", "--bucket_count", type=int, default=500) + +if __name__ == "__main__": + version = 1.00 + print(f"Running version {version}") + + if "PYTHONHASHSEED" not in os.environ or os.environ["PYTHONHASHSEED"] != 
"0": + print("Please run 'export PYTHONHASHSEED=0' before running generate.") + sys.exit() + + # Handle sigint (ctrl-c) cleanly + previous_signal_int = signal.signal(SIGINT, handler) + + logfile_path = "ngrams.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + do_ngrams_in_buckets(args.n_value, args.working_directory, args.bucket_count) + + info_dict = {"title": "dataset ngrams", "ngram_size": 13} + info_dict_path = os.path.join(args.working_directory, "info.json") + json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8")) diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py new file mode 100644 index 0000000000000000000000000000000000000000..681b591ced535dbb884fb65f58a0c9042c35b0ac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py @@ -0,0 +1,95 @@ +import glob +import json +import os +from functools import reduce + +import tqdm +from tqdm_multiprocess import TqdmMultiProcessPool + +from lm_eval.decontamination.archiver import Reader + + +def get_file_stats(file_path, tqdm_func, global_tqdm): + reader = Reader() + total_documents = 0 + total_size = 0 + update_frequency = 10000 + current_file_position = 0 + + with tqdm_func( + total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1 + ) as progress: + for document in reader.read(file_path, get_meta=True): + total_size += len(document) + total_documents += 1 + + if total_documents % update_frequency == 0: + new_file_pos = reader.fh.tell() + bytes_read = new_file_pos - current_file_position + current_file_position = new_file_pos + progress.update(bytes_read) + global_tqdm.update(bytes_read) + + return (total_documents, total_size) + + +def get_files(): + directory = "pile" + files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) + print(files) + return files + + +def get_stats(): + files = get_files() + total_size_bytes = sum(map(lambda x: os.path.getsize(x), files)) + + pool = TqdmMultiProcessPool(4) + global_tqdm = tqdm.tqdm( + total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1 + ) + + # Generate minhashes with pool + tasks = [(get_file_stats, (file,)) for file in files] + + def on_done(_): + return None + + def on_error(_): + return None + + results = pool.map(global_tqdm, tasks, on_error, on_done) + + total_documents, total_size = reduce( + lambda x, y: (x[0] + y[0], x[1] + y[1]), results + ) + + start_offsets = [] + current_offset = 0 + for file_document_count, _ in results: + start_offsets.append(current_offset) + current_offset += file_document_count + + return (total_documents, total_size, start_offsets) + + +if __name__ == "__main__": + version = 1.01 + print(f"Running version {version}") + + stats_file_path = "pile_statistics.json" + if os.path.exists(stats_file_path): + stats = json.load(open(stats_file_path, "r", encoding="utf-8")) + else: + document_count, total_document_size_chars, start_offsets = get_stats() + stats = { + "Data": "Pile statistics", + "Document Count": document_count, + "Total Pile Characters": total_document_size_chars, + "File Start Offsets": start_offsets, + } + json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4) + + print(f"document_count: {stats['Document Count']}") + print(f"total_chars: {stats['Total Pile Characters']}") + print(f"start_offsets: {stats['File Start Offsets']}") diff --git 
a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..858a8b20492507a6228a640cef0cc3ec7ac56bca
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp
@@ -0,0 +1,208 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <cctype>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+bool is_whitespace(char ch) noexcept {
+  // " \t\n\r\x0b\x0c" (python string.whitespace)
+  return ch == 32 or (9 <= ch and ch <= 13);
+  // return ch <= 32; // arguably too general, but slightly faster
+}
+
+bool is_punctuation(char c) noexcept {
+  // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64,
+  // 91-96, 123-126
+  return (33 <= c and c <= 47) or (58 <= c and c <= 64) or
+         (91 <= c and c <= 96) or (123 <= c and c <= 126);
+}
+
+// Takes a string and makes ngrams of length N, splitting grams on whitespace
+// and ignoring ignored characters. Returns a LARGE array of ngrams
+std::vector<std::string> clean_ngram(std::string const &input,
+                                     std::string const &ignore,
+                                     size_t ngram_n) noexcept {
+
+  size_t num_grams = 0;
+  std::vector<std::string> ngram_list;
+  std::vector<size_t> gram_lengths;
+  std::string current_ngram;
+
+  // Max gram length is set to 10 below.
+  current_ngram.reserve(11 * ngram_n);
+  gram_lengths.reserve(ngram_n);
+
+  bool started_gram = false;
+  gram_lengths.push_back(0);
+
+  // for (size_t i = 0; i < input.length(); i++) {
+  for (auto iter = input.begin(); iter != input.end(); iter++) {
+
+    // If whitespace, end the current ngram and start the next
+    if (is_whitespace(*iter) or gram_lengths.back() > 10) {
+
+      // Skip all whitespace
+      while (++iter != input.end() && is_whitespace(*iter))
+        ;
+      iter--;
+
+      if (started_gram) {
+        num_grams += 1;
+
+        // Building 1grams is a special case
+        if (ngram_n == 1) {
+          ngram_list.push_back(current_ngram);
+          current_ngram = current_ngram.substr(gram_lengths.front());
+          gram_lengths.back() = 0;
+
+          // If there are enough grams to form an ngram, save
+        } else if (num_grams >= ngram_n) {
+          // Save the current ngram
+          ngram_list.push_back(current_ngram);
+
+          // Start the next ngram by dropping the first gram and its space from
+          // the ngram
+          current_ngram = current_ngram.substr(gram_lengths.front() + 1);
+          current_ngram += ' ';
+
+          // Drop the length of the first gram and prepare to record the length
+          // of the new gram
+          gram_lengths.erase(gram_lengths.begin());
+          gram_lengths.push_back(0);
+
+          // Otherwise, continue building
+        } else {
+          current_ngram += ' ';
+          gram_lengths.push_back(0);
+        }
+
+        started_gram = false;
+      }
+
+      // Skip ignored characters
+      // alternatively, (perhaps marginally) faster: if (is_punctuation(ch))
+      // continue;
+    } else if (ignore.find(*iter) != std::string::npos) {
+      continue;
+    }
+
+    // If it is a non-ignored character, add it to the ngram and update the last
+    // gram's length
+    else {
+      current_ngram += tolower(*iter);
+      gram_lengths.back() += 1;
+      started_gram = true;
+    }
+  }
+
+  return ngram_list;
+}
+
+// Takes a string and makes ngrams of length N, splitting grams on whitespace
+// and ignoring ignored characters. Returns a LARGE array of tuples of (ngram,
+// start_idx, end_idx)
+std::vector<std::tuple<std::string, size_t, size_t>>
+clean_ngram_with_indices(std::string const &input, std::string const &ignore,
+                         size_t ngram_n) noexcept {
+
+  size_t num_grams = 0;
+  std::vector<std::tuple<std::string, size_t, size_t>> ngram_list;
+  std::vector<size_t> gram_lengths;
+  std::vector<size_t> gram_start_indices;
+  std::string current_ngram;
+
+  // Max gram length is set to 10 below.
+ current_ngram.reserve(11 * ngram_n); + + bool started_gram = false; + gram_lengths.push_back(0); + gram_start_indices.push_back(0); + + for (size_t i = 0; i < input.length(); i++) { + char ch = input[i]; + + // If whitespace, end the current ngram and start the next + if (is_whitespace(ch) || gram_lengths.back() > 10) { + + // Skip all whitespace + while (++i < input.length() && is_whitespace(input[i])) + ; + i--; + + if (started_gram) { + num_grams += 1; + + // Building 1grams is a special case + if (ngram_n == 1) { + ngram_list.push_back( + std::make_tuple(current_ngram, gram_start_indices.front(), i)); + current_ngram = current_ngram.substr(gram_lengths.front()); + gram_lengths.back() = 0; + gram_start_indices.back() = i + 1; + + // If there are enough grams to form an ngram, save + } else if (num_grams >= ngram_n) { + + // Save the current ngram + ngram_list.push_back( + std::make_tuple(current_ngram, gram_start_indices.front(), i)); + + // Start the next ngram by dropping the first gram and its space from + // the ngram + current_ngram = current_ngram.substr(gram_lengths.front() + 1); + current_ngram += ' '; + + // Drop the length of the first gram and prepare to record the length + // of the new gram + gram_lengths.erase(gram_lengths.begin()); + gram_lengths.push_back(0); + + gram_start_indices.erase(gram_start_indices.begin()); + gram_start_indices.push_back(i + 1); + + // Otherwise, continue building + } else { + current_ngram += ' '; + gram_lengths.push_back(0); + gram_start_indices.push_back(i + 1); + } + + started_gram = false; + } + + // Skip ignored characters + } else if (ignore.find(ch) != std::string::npos) { + continue; + + // If it is a non-ignored character, add it to the ngram and update the + // last gram's length + } else { + current_ngram += tolower(ch); + gram_lengths.back() += 1; + started_gram = true; + } + } + + return ngram_list; +} + +PYBIND11_MODULE(janitor_util, m) { + m.doc() = "pybind11 example plugin"; // optional module docstring + // m.def("add", &add, "A function which adds two numbers"); // example + // function + m.def("clean_ngram", &clean_ngram, + "Create ngrams of words, ignoring some characters"); + m.def("clean_ngram_with_indices", &clean_ngram_with_indices, + "Create ngrams of words with indices, ignoring some characters"); +} + +// Example compile +// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) +// janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) If +// python and gcc aren't linked, append to the above: -undefined +// dynamic_lookup diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py new file mode 100644 index 0000000000000000000000000000000000000000..9d345d8e86f409495b95a73f4539b2f4df57af70 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py @@ -0,0 +1,129 @@ +""" +Processes each sorted bucket, creating a new file listing all ngrams that matched more then 10 +unique documents with their unique document counts. Uses multiprocessing and very little memory +as we stream from presorted buckets. Will use a lot of disk though. + +Arguments +--------- +--working_directory (-dir) + Directory containing the sorted buckets, processed files will be deposited here. Default: current directory +--move_dir (-move) + Directory to move processed 13grams too. 
Default: Do nothing +--process_count (-procs) + Number of processes to use. Default: 4 +""" + +import argparse +import glob +import logging +import os +import re +import shutil +from pathlib import Path + +from tqdm import tqdm +from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm + +from scripts.clean_training_data.archiver import TextArchive, TextReader + + +logger = logging.getLogger(__name__) + + +# Multiprocessed +def process_bucket( + bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm +): + bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 + done_file = os.path.join( + processed_directory, f"ngram_bucket_processing_{bucket_id}.done" + ) + if os.path.exists(done_file): + logger.info(f"bucket {bucket_id} already processed, skipping") + return + + # For managing tqdm + file_size = os.path.getsize(bucket_file_path) + bucket_progress = tqdm_func( + total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1 + ) + current_file_position = 0 + update_frequency = 100 * 1000000 # 100mb + update_counter = 0 + + # Iterate through and output ngrams which occur in more then 10 documents + bucket = TextReader(bucket_file_path) + + output_file_path = bucket_file_path + ".processed" + output_archive = TextArchive(output_file_path, mode="wb") + + current_ngram = "" + current_ngram_document_ids = set() + for line in bucket.read(): + [ngram, document_id] = line.rsplit(" ", 1) + + # Write ngram if more then 10 unique document occurrences + if ngram != current_ngram: + if len(current_ngram_document_ids) > 10: + output_archive.add_data( + f"{current_ngram} {len(current_ngram_document_ids)}" + ) + current_ngram = ngram + current_ngram_document_ids = set() + + current_ngram_document_ids.add(document_id) + + # Update tqdm + update_counter += bucket.fh.tell() - current_file_position + current_file_position = bucket.fh.tell() + if update_counter > update_frequency: + bucket_progress.update(update_counter) + update_counter = 0 + + # Remainder + if len(current_ngram_document_ids) > 10: + output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}") + + output_archive.commit() + Path(done_file).touch() + + if move_dir: + shutil.move(output_file_path, move_dir) + + global_tqdm.update() + + +def process_sorted_buckets(working_directory, move_dir, process_count): + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted")) + processed_directory = os.path.join(working_directory, "processed") + os.makedirs(processed_directory, exist_ok=True) + + pool = TqdmMultiProcessPool(process_count) + tasks = [ + (process_bucket, (bucket_file, processed_directory, move_dir)) + for bucket_file in bucket_file_paths + ] + + global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket") + + def on_done(_): + return None + + def on_error(_): + return None + + _ = pool.map(global_tqdm, tasks, on_error, on_done) + + +parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.") +parser.add_argument("-dir", "--working_directory", default="") +parser.add_argument("-move", "--move_dir", default="") +parser.add_argument("-procs", "--process_count", type=int, default=4) + +if __name__ == "__main__": + logfile_path = "process13grams.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + process_sorted_buckets(args.working_directory, args.move_dir, args.process_count) diff --git 
a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py new file mode 100644 index 0000000000000000000000000000000000000000..83990de822e333bcd16c8d8092aec7ce41ff4e94 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py @@ -0,0 +1,62 @@ +""" +Iteratively runs gnu sort on each bucket, uses up to 8 cores. + +Arguments +--------- +--working_directory (-dir) + Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same + directory and the unsorted buckets are removed after. +""" + +import argparse +import glob +import logging +import os +import signal +import subprocess +from signal import SIGINT + +from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm + + +logger = logging.getLogger(__name__) + +terminate = False + + +def handler(signal_received, frame): + global terminate + terminate = True + + +def sort_13_gram_buckets(working_directory): + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt")) + + for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): + sorted_file_path = bucket_file_path + ".sorted" + command = f"sort {bucket_file_path} > {sorted_file_path}" + logger.info(command) + subprocess.call(command, shell=True) + + if terminate: + return + + os.remove(bucket_file_path) + + +parser = argparse.ArgumentParser(description="sort 13gram buckets") +parser.add_argument("-dir", "--working_directory", default="") + +if __name__ == "__main__": + version = 1.00 + print(f"Running version {version}") + + # Handle sigint (ctrl-c) cleanly + previous_signal_int = signal.signal(SIGINT, handler) + + logfile_path = "sort13grambuckets.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + sort_13_gram_buckets(args.working_directory) diff --git a/scripts/yans/lm-evaluation-harness/scripts/cost_estimate.py b/scripts/yans/lm-evaluation-harness/scripts/cost_estimate.py new file mode 100644 index 0000000000000000000000000000000000000000..baf81147547b0a7a92e52904c70cb11d246f680b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/cost_estimate.py @@ -0,0 +1,99 @@ +import random + +import transformers + +from lm_eval import evaluator, tasks +from lm_eval.api.model import LM + + +class DryrunLM(LM): + def __init__(self): + self.tokencost = 0 + self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") + self.tokenizer.pad_token = "<|endoftext|>" + + @classmethod + def create_from_arg_string(cls, arg_string): + return cls() + + def loglikelihood(self, requests): + res = [] + + for ctx, cont in requests: + res.append((-random.random(), False)) + self.tokencost += len(self.tokenizer.tokenize(ctx + cont)) + + return res + + def generate_until(self, requests): + res = [] + + for ctx, _ in requests: + res.append("lol") + + # assume worst case - generates until 256 + self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256 + + return res + + def loglikelihood_rolling(self, requests): + res = [] + + for (s,) in requests: + # assume worst case: extra full context + self.tokencost += len(self.tokenizer.tokenize(s)) + 2048 + + return res + + +def main(): + lm = DryrunLM() + + task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc" + values = [] + for 
taskname in task_list.split(","): + lm.tokencost = 0 + evaluator.simple_evaluate( + lm=lm, + task_dict={taskname: tasks.get_task(taskname)()}, + num_fewshot=0, + limit=None, + bootstrap_iters=10, + ) + + print(taskname, lm.tokencost) + values.append( + [ + taskname, + lm.tokencost, + lm.tokencost / 1000 * 0.0008, + lm.tokencost / 1000 * 0.0012, + lm.tokencost / 1000 * 0.006, + lm.tokencost / 1000 * 0.06, + ] + ) + from pytablewriter import MarkdownTableWriter + + writer = MarkdownTableWriter() + writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"] + + values.sort(key=lambda x: -x[1]) + totcost = sum([x[1] for x in values]) + values.append( + [ + "**Total**", + totcost, + totcost / 1000 * 0.0008, + totcost / 1000 * 0.0012, + totcost / 1000 * 0.006, + totcost / 1000 * 0.06, + ] + ) + + writer.value_matrix = values + + print(writer.dumps()) + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/scripts/get_prompts.py b/scripts/yans/lm-evaluation-harness/scripts/get_prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..d262ec37e40f229c2009f9f162cc58834291de12 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/get_prompts.py @@ -0,0 +1,25 @@ +from itertools import islice + +from lm_eval import tasks + + +ct = 3 + +for ( + tname, + Task, +) in tasks.TASK_REGISTRY.items(): # [('record', tasks.superglue.ReCoRD)]:# + task = Task() + + print("#", tname) + docs = islice( + task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct + ) + print() + for i in range(ct): + print() + doc = next(docs) + print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n") + print() + print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n") + print() diff --git a/scripts/yans/lm-evaluation-harness/scripts/make_gpt2_test_cases.py b/scripts/yans/lm-evaluation-harness/scripts/make_gpt2_test_cases.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1a4bffe03ef057c331dc9a20c0a5eadb46be66 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/make_gpt2_test_cases.py @@ -0,0 +1,48 @@ +import random + +import torch +import torch.nn.functional as F +import transformers + + +random.seed(42) + + +data = [ + "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", + "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology", + 'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]', + "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.", + "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]", + "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. 
By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ", + "Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.", + "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", + "Hello World", +] + + +model = transformers.GPT2LMHeadModel.from_pretrained("gpt2") +tok = transformers.GPT2Tokenizer.from_pretrained("gpt2") + +tgs = [] + +for dat in data: + random.seed(dat) + # print(model(tok.encode(dat, return_tensors="pt"))[0][0]) + + toks = tok.encode(dat, return_tensors="pt") + ind = random.randrange(len(toks[0]) - 1) + logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1] # [batch, seq, vocab] + + res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0] + + tgs.append(float(res[ind:].sum())) + print( + r'("""' + + tok.decode(toks[0, : ind + 1]) + + r'""", """' + + tok.decode(toks[0, ind + 1 :]) + + r'"""), ' + ) + +print(tgs) diff --git a/scripts/yans/lm-evaluation-harness/scripts/make_table_results.py b/scripts/yans/lm-evaluation-harness/scripts/make_table_results.py new file mode 100644 index 0000000000000000000000000000000000000000..59eddb4a4fdac05c1d2ce3623a7bd4312101bec2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/make_table_results.py @@ -0,0 +1,75 @@ +""" +Usage: + python make_table_tasks.py --output +""" + +import json +import logging +import os + +from pytablewriter import LatexTableWriter, MarkdownTableWriter + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def make_table(result_dict): + """Generate table of results.""" + md_writer = MarkdownTableWriter() + latex_writer = LatexTableWriter() + md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] + latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] + + values = [] + + for k, dic in sorted(result_dict["results"].items()): + version = result_dict["versions"][k] + percent = k == "squad2" + for m, v in dic.items(): + if m.endswith("_stderr"): + continue + + if m + "_stderr" in dic: + se = dic[m + "_stderr"] + if percent or m == "ppl": + values.append([k, version, m, "%.2f" % v, "±", "%.2f" % se]) + else: + values.append( + [k, version, m, "%.2f" % (v * 
100), "±", "%.2f" % (se * 100)] + ) + else: + if percent or m == "ppl": + values.append([k, version, m, "%.2f" % v, "", ""]) + else: + values.append([k, version, m, "%.2f" % (v * 100), "", ""]) + k = "" + version = "" + md_writer.value_matrix = values + latex_writer.value_matrix = values + + # todo: make latex table look good + # print(latex_writer.dumps()) + + return md_writer.dumps() + + +if __name__ == "__main__": + # loop dirs and subdirs in results dir + # for each dir, load json files + for dirpath, dirnames, filenames in os.walk("../results"): + # skip dirs without files + if not filenames: + continue + path_readme = os.path.join(dirpath, "README.md") + with open(path_readme, "w", encoding="utf-8") as f: + # get path name, only last folder + path_name = dirpath.split("/")[-1] + f.write(f"# {path_name} \n\n") + for filename in sorted([f for f in filenames if f.endswith(".json")]): + path = os.path.join(dirpath, filename) + with open(path, "r", encoding="utf-8") as f: + result_dict = json.load(f) + with open(path_readme, "a", encoding="utf-8") as f: + f.write(f"## {filename} \n") + f.write(f"{make_table(result_dict)} \n") diff --git a/scripts/yans/lm-evaluation-harness/scripts/make_table_tasks.py b/scripts/yans/lm-evaluation-harness/scripts/make_table_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..8a3b19634b11eb9974a32dcd2a80cab3f0940f9e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/make_table_tasks.py @@ -0,0 +1,55 @@ +""" +Usage: + python make_table_tasks.py --output +""" + +import argparse +import logging + +from pytablewriter import MarkdownTableWriter + +from lm_eval import tasks + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def check(tf): + if tf: + return "✓" + else: + return " " + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output", type=str, default="task_table.md") + args = parser.parse_args() + + writer = MarkdownTableWriter() + writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"] + values = [] + + tasks = tasks.TASK_REGISTRY.items() + tasks = sorted(tasks, key=lambda x: x[0]) + for tname, Task in tasks: + task = Task() + v = [ + tname, + check(task.has_training_docs()), + check(task.has_validation_docs()), + check(task.has_test_docs()), + len( + list( + task.test_docs() if task.has_test_docs() else task.validation_docs() + ) + ), + ", ".join(task.aggregation().keys()), + ] + logger.info(v) + values.append(v) + writer.value_matrix = values + table = writer.dumps() + with open(args.output, "w", encoding="utf-8") as f: + f.write(table) diff --git a/scripts/yans/lm-evaluation-harness/scripts/model_comparator.py b/scripts/yans/lm-evaluation-harness/scripts/model_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..55f4f3b15468b2f46e590cbfd82d7902f1d9a16f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/model_comparator.py @@ -0,0 +1,139 @@ +import argparse +import os +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import torch + +import lm_eval.evaluator +import lm_eval.models.utils +from lm_eval import tasks, utils + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +eval_logger = utils.eval_logger + + +def memory_stats(): + eval_logger.info( + f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" + ) + + +def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, 
float]: + from scipy.stats.norm import sf + + acc1, acc2 = res1["acc,none"], res2["acc,none"] + st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"] + Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2)) + # Determining the p-value + p_value = 2 * sf(abs(Z)) # two-tailed test + return Z, p_value + + +def print_results( + data_to_print: List = None, results_dict: Dict = None, alpha: float = None +): + model1_data = data_to_print[0] + model2_data = data_to_print[1] + table_data = [] + for task in model1_data.keys(): + row = { + "Task": task, + "HF Accuracy": model1_data[task]["acc,none"], + "vLLM Accuracy": model2_data[task]["acc,none"], + "HF StdErr": model1_data[task]["acc_stderr,none"], + "vLLM StdErr": model2_data[task]["acc_stderr,none"], + } + table_data.append(row) + comparison_df = pd.DataFrame(table_data) + comparison_df["Z-Score"] = comparison_df["Task"].apply( + lambda task: results_dict[task]["z"] + ) + comparison_df["P-Value"] = comparison_df["Task"].apply( + lambda task: results_dict[task]["p_value"] + ) + comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply( + lambda p: "✓" if p > alpha else "×" + ) + return comparison_df + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare" + ) + parser.add_argument( + "--hf_args", help="huggingface model args =", default="" + ) + parser.add_argument("--vllm_args", help="vllm model args =", default="") + parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag") + parser.add_argument( + "--limit", + type=float, + default=100, + ) + parser.add_argument( + "--alpha", + type=float, + default=0.05, + help="Significance level for two-tailed z-test", + ) + parser.add_argument( + "--device", + type=str, + default="cuda", + ) + parser.add_argument( + "--batch", + type=str, + default=8, + ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Logging verbosity", + ) + return parser.parse_args() + + +if __name__ == "__main__": + tasks.initialize_tasks() + args = parse_args() + tasks = args.tasks.split(",") + print(tasks) + hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args + results_vllm = lm_eval.evaluator.simple_evaluate( + model="vllm", + model_args=f"pretrained={args.pretrained}" + vllm_args, + tasks=tasks, + limit=args.limit, + device=args.device, + batch_size=args.batch, + ) + memory_stats() + lm_eval.models.utils.clear_torch_cache() + eval_logger.info("Memory stats cleared") + memory_stats() + results_hf = lm_eval.evaluator.simple_evaluate( + model="hf", + model_args=f"pretrained={args.pretrained}" + hf_args, + tasks=tasks, + limit=args.limit, + device=args.device, + batch_size=args.batch, + ) + all_res = {} + for task1, task2 in zip( + results_hf["results"].items(), results_vllm["results"].items() + ): + assert task1[0] == task2[0] + z, p_value = calculate_z_value(task1[1], task2[1]) + all_res[task1[0]] = {"z": z, "p_value": p_value} + df = print_results( + [results_hf["results"], results_vllm["results"]], all_res, args.alpha + ) + print(df) diff --git a/scripts/yans/lm-evaluation-harness/scripts/regression.py b/scripts/yans/lm-evaluation-harness/scripts/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..75258dcb640a4f32a0011e864d390e9619f6e2e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/regression.py @@ -0,0 +1,199 @@ +import argparse +import json +import os +import subprocess +import time +from pathlib 
import Path + +from lm_eval import utils +from lm_eval.api.registry import ALL_TASKS + + +seq2seq_models = ["google/flan-t5-small"] +causal_models = [ + "gpt2", + "facebook/opt-125m", + "EleutherAI/gpt-neo-125m", + "EleutherAI/pythia-160m", +] +model_names = seq2seq_models + causal_models + + +completion_tasks = ["boolq", "lambada_openai", "winogrande"] +choice_tasks = ["hellaswag", "openbookqa", "piqa"] +perplexity_tasks = ["wikitext"] +generation_tasks = [] +task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--branches", default=[]) + parser.add_argument("--models", default=model_names) + parser.add_argument("--tasks", default=task_names) + parser.add_argument("--acc_norm", type=bool, default=False) + parser.add_argument("--perplexity", default=None) + # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000 + parser.add_argument("--num_fewshot", type=int, default=0) + parser.add_argument("--limit", type=float, default=None) + # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this + parser.add_argument("--model", default="hf-causal") + # Use whatever is faster here + parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True") + parser.add_argument("--batch_size", default="auto") + return parser.parse_args() + + +def eval_models(args, branch=None): + if branch is not None: + if os.system(f"git checkout {branch}") != 0: + return {}, 0 + + branch = branch or initial_branch + + start_time = time.time() + + results = {} + + for model in args.models: + model_type = ( + "hf-causal" + if model in causal_models + else "hf-seq2seq" + if model in seq2seq_models + else args.model + ) + model_args = f"pretrained={model},{args.model_args}" + # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527 + tasks = ( + args.tasks + if model in causal_models or model_type == "hf-causal" + else list(filter(lambda task: task not in perplexity_tasks, args.tasks)) + ) + # TODO: OOM with auto for seq2seq models, also can OOM with llama + batch_size = ( + args.batch_size + if model in causal_models or model_type == "hf-causal" + else 64 + if args.batch_size == "auto" + else args.batch_size + ) + output_path = ( + f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json" + ) + + command = ( + f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " + f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " + f"--batch_size {batch_size} --no_cache --output_path {output_path}" + ) + + print( + f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}" + ) + + ret = os.system(command) + + results[model] = ( + json.load(open(output_path, encoding="utf-8")) + if ret == 0 + else {"results": {}} + ) + + end_time = time.time() + + return results, end_time - start_time + + +def extract_value(args, results, model, task, err=False): + if model not in results: + return 0 + results = results[model]["results"] + if task not in results: + return 0 + results = results[task] + if args.acc_norm and "acc_norm,none" in results: + return results["acc_norm,none"] if not err else results["acc_norm_stderr,none"] + if "acc,none" in results: + return results["acc,none"] if not err else results["acc_stderr,none"] + if (args.perplexity or "word_perplexity") + ",none" in results: + return ( + 
results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0 + ) + return 0 + + +def format_value(args, results, model, task): + val = 100 * extract_value(args, results, model, task) + err = 100 * extract_value(args, results, model, task, err=True) + return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}" + + +def format_diff(args, results1, results2, model, task): + val1 = 100 * extract_value(args, results1, model, task) + val2 = 100 * extract_value(args, results2, model, task) + diff = val2 - val1 + return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}" + + +def main(): + args = parse_args() + + args.branches = ( + args.branches.split(",") if isinstance(args.branches, str) else args.branches + ) + args.models = ( + args.models.split(",") if isinstance(args.models, str) else args.models + ) + args.tasks = ( + ALL_TASKS + if args.tasks == "all_tasks" + else utils.pattern_match(args.tasks.split(","), ALL_TASKS) + if isinstance(args.tasks, str) + else args.tasks + ) + + global initial_branch + initial_branch = ( + subprocess.check_output("git branch --show-current", shell=True) + .decode("ascii") + .strip() + ) + + # TODO: implement proper timing for each task + # TODO: reduce IO by sharing tasks between models? + + results, runtime = eval_models(args) + print(results, runtime) + + runs = [] + for branch in args.branches: + runs.append((branch, *eval_models(args, branch))) + + os.system(f"git checkout {initial_branch}") + + print("") + print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|") + print(f"|--|{'--|' * len(args.models)}") + for task in args.tasks: + print( + f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|" + ) + for branch, branch_results, branch_runtime in runs: + print( + f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|" + ) + print( + f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|" + ) + + print("") + print("|branch|runtime|%|") + print("|--|--|--|") + print(f"|{initial_branch}|{runtime:.1f}s|100%|") + for branch, _, branch_runtime in runs: + print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|") + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/scripts/requests_caching.py b/scripts/yans/lm-evaluation-harness/scripts/requests_caching.py new file mode 100644 index 0000000000000000000000000000000000000000..2aaf323485606c61b435fe0f3ab5a6c97b5561b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/requests_caching.py @@ -0,0 +1,92 @@ +""" +Usage: + python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests= +""" + +import argparse +import os +from typing import List + +import torch +from transformers import ( + pipeline as trans_pipeline, +) + +from lm_eval import simple_evaluate +from lm_eval.evaluator import request_caching_arg_to_dict +from lm_eval.utils import eval_logger + + +MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Used to specify alternate cache path, useful if run in a docker container +# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image +LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH") + + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +MODEL = "EleutherAI/pythia-70m" + +TASK = "text-generation" + + +def run_model_for_task_caching(tasks: List[str], 
cache_requests: str): + eval_logger.info(f"Loading HF model: {MODEL}") + + trans_pipe = trans_pipeline( + task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True + ) + + model = trans_pipe.model + tokenizer = trans_pipe.tokenizer + + eval_logger.info( + f"Running simple_evaluate to cache request objects for tasks: {tasks}" + ) + + cache_args = request_caching_arg_to_dict(cache_requests=cache_requests) + + eval_logger.info( + f"The following operations will be performed on the cache: {cache_requests}" + ) + + eval_data = simple_evaluate( + model="hf-auto", + model_args={ + "pretrained": model, + "tokenizer": tokenizer, + }, + limit=1, + device=DEVICE, + tasks=tasks, + write_out=True, + **cache_args, + ) + + return eval_data + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tasks", + "-t", + default=None, + metavar="task1,task2", + ) + parser.add_argument( + "--cache_requests", + type=str, + default=None, + choices=["true", "refresh", "delete"], + help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", + ) + + args = parser.parse_args() + + tasks = args.tasks.split(",") + + eval_data = run_model_for_task_caching( + tasks=tasks, model=MODEL, device=DEVICE, cache_requests=args.cache_requests + ) diff --git a/scripts/yans/lm-evaluation-harness/scripts/write_out.py b/scripts/yans/lm-evaluation-harness/scripts/write_out.py new file mode 100644 index 0000000000000000000000000000000000000000..6ff5a4304ed7798f8e375abeb8a5f30cb2aedcea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/write_out.py @@ -0,0 +1,97 @@ +import argparse +import os +import random + +import numpy as np + +from lm_eval import tasks +from lm_eval.tasks import TaskManager +from lm_eval.utils import eval_logger, join_iters + + +EXAMPLE_DIVIDER = "!!@@##@@!! 
-- Example {i}\n" + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_base_path", "--output_path", required=True) + parser.add_argument("--tasks", default="all_tasks") + parser.add_argument("--sets", type=str, default="val") # example: val,test + parser.add_argument("--num_fewshot", type=int, default=1) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num_examples", type=int, default=1) + parser.add_argument( + "--include_path", + type=str, + default=None, + help="Additional path to include if there are external tasks to include.", + ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Log error when tasks are not registered.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + np.random.seed(args.seed) + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + + task_manager = TaskManager(args.verbosity, include_path=args.include_path) + + if args.tasks == "all_tasks": + task_names = task_manager.all_tasks + else: + task_names = args.tasks.split(",") + task_dict = tasks.get_task_dict(task_names, task_manager) + + os.makedirs(args.output_base_path, exist_ok=True) + for task_name, task in task_dict.items(): + if isinstance(task, tuple): + _, task = task + rnd = random.Random() + rnd.seed(args.seed) + + iters = [] + + for set in args.sets.split(","): + docs = None + if set == "train" and task.has_training_docs(): + docs = task.training_docs() + if set == "val" and task.has_validation_docs(): + docs = task.validation_docs() + if set == "test" and task.has_test_docs(): + docs = task.test_docs() + if docs is not None: + iters.append(docs) + + if len(iters) == 0: + raise ValueError( + f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value." + ) + + docs = join_iters(iters) + + with open( + os.path.join(args.output_base_path, task_name), "w", encoding="utf8" + ) as f: + for i, doc in ( + zip(range(args.num_examples), docs) + if args.num_examples > 0 + else enumerate(docs) + ): + f.write(EXAMPLE_DIVIDER.format(i=i)) + ctx = task.fewshot_context( + doc=doc, + num_fewshot=args.num_fewshot, + ) + f.write(ctx + "\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/scripts/zeno_visualize.py b/scripts/yans/lm-evaluation-harness/scripts/zeno_visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..f2772a235579b64cb05353a950a716e104a44cb2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/zeno_visualize.py @@ -0,0 +1,242 @@ +import argparse +import json +import os +import re +from pathlib import Path + +import pandas as pd +from zeno_client import ZenoClient, ZenoMetric + +from lm_eval.utils import ( + eval_logger, + get_latest_filename, + get_results_filenames, + get_sample_results_filenames, +) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk." + ) + parser.add_argument( + "--data_path", + required=True, + help="Where to find the results of the benchmarks that have been run. 
Uses the name of each subfolder as the model name.", + ) + parser.add_argument( + "--project_name", + required=True, + help="The name of the generated Zeno project.", + ) + return parser.parse_args() + + +def main(): + """Upload the results of your benchmark tasks to the Zeno AI evaluation platform. + + This scripts expects your results to live in a data folder where subfolders contain results of individual models. + """ + args = parse_args() + + client = ZenoClient(os.environ["ZENO_API_KEY"]) + + # Get all model subfolders from the parent data folder. + models = [ + os.path.basename(os.path.normpath(f)) + for f in os.scandir(Path(args.data_path)) + if f.is_dir() + ] + + assert len(models) > 0, "No model directories found in the data_path." + + # Get the tasks from the latest results file of the first model. + tasks = set(tasks_for_model(models[0], args.data_path)) + + # Get tasks names from the latest results file for each model + # Get intersection of tasks for all models + for model in models: + old_tasks = tasks.copy() + task_count = len(tasks) + model_tasks = set(tasks_for_model(model, args.data_path)) + tasks.intersection(set(model_tasks)) + + if task_count != len(tasks): + eval_logger.warning( + f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}" + ) + + assert ( + len(tasks) > 0 + ), "Must provide at least one task in common amongst models to compare." + + for task in tasks: + # Upload data for all models + for model_index, model in enumerate(models): + # Get latest results and sample results for a model + model_dir = Path(args.data_path, model) + model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()] + model_results_filenames = get_results_filenames(model_files) + model_sample_filenames = get_sample_results_filenames(model_files) + latest_results = get_latest_filename( + [Path(f).name for f in model_results_filenames] + ) + latest_sample_results = get_latest_filename( + [Path(f).name for f in model_sample_filenames if task in f] + ) + model_args = re.sub( + r"[\"<>:/\|\\?\*\[\]]+", + "__", + json.load( + open(Path(args.data_path, model, latest_results), encoding="utf-8") + )["config"]["model_args"], + ) + print(model_args) + data = [] + with open( + Path(args.data_path, model, latest_sample_results), + "r", + encoding="utf-8", + ) as file: + for line in file: + data.append(json.loads(line.strip())) + + configs = json.load( + open(Path(args.data_path, model, latest_results), encoding="utf-8") + )["configs"] + config = configs[task] + + if model_index == 0: # Only need to assemble data for the first model + metrics = [] + for metric in config["metric_list"]: + metrics.append( + ZenoMetric( + name=metric["metric"], + type="mean", + columns=[metric["metric"]], + ) + ) + project = client.create_project( + name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""), + view="text-classification", + metrics=metrics, + ) + project.upload_dataset( + generate_dataset(data, config), + id_column="id", + data_column="data", + label_column="labels", + ) + + project.upload_system( + generate_system_df(data, config), + name=model, + id_column="id", + output_column="output", + ) + + +def tasks_for_model(model: str, data_path: str): + """Get the tasks for a specific model. + + Args: + model (str): The name of the model. + data_path (str): The path to the data. + + Returns: + list: A list of tasks for the model. 
+ """ + # get latest model results for a given name + model_dir = Path(data_path, model) + model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()] + model_results_filenames = get_results_filenames(model_files) + latest_results = get_latest_filename(model_results_filenames) + config = (json.load(open(latest_results, encoding="utf-8"))["configs"],) + return list(config[0].keys()) + + +def generate_dataset( + data, + config, +): + """Generate a Zeno dataset from evaluation data. + + Args: + data: The data to generate a dataset for. + config: The configuration of the task. + + Returns: + pd.Dataframe: A dataframe that is ready to be uploaded to Zeno. + """ + ids = [x["doc_id"] for x in data] + labels = [x["target"] for x in data] + instance = [""] * len(ids) + + if config["output_type"] == "loglikelihood": + instance = [x["arguments"][0][0] for x in data] + labels = [x["arguments"][0][1] for x in data] + elif config["output_type"] == "multiple_choice": + instance = [ + x["arguments"][0][0] + + "\n\n" + + "\n".join([f"- {y[1]}" for y in x["arguments"]]) + for x in data + ] + elif config["output_type"] == "loglikelihood_rolling": + instance = [x["arguments"][0][0] for x in data] + elif config["output_type"] == "generate_until": + instance = [x["arguments"][0][0] for x in data] + + return pd.DataFrame( + { + "id": ids, + "data": instance, + "input_len": [len(x) for x in instance], + "labels": labels, + "output_type": config["output_type"], + } + ) + + +def generate_system_df(data, config): + """Generate a dataframe for a specific system to be uploaded to Zeno. + + Args: + data: The data to generate a dataframe from. + config: The configuration of the task. + + Returns: + pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system. + """ + ids = [x["doc_id"] for x in data] + system_dict = {"id": ids} + system_dict["output"] = [""] * len(ids) + + if config["output_type"] == "loglikelihood": + system_dict["output"] = [ + "correct" if x["filtered_resps"][0][1] is True else "incorrect" + for x in data + ] + elif config["output_type"] == "multiple_choice": + system_dict["output"] = [ + ", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data + ] + system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data] + elif config["output_type"] == "loglikelihood_rolling": + system_dict["output"] = [str(x["filtered_resps"][0]) for x in data] + elif config["output_type"] == "generate_until": + system_dict["output"] = [str(x["filtered_resps"][0]) for x in data] + system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data] + + metrics = {} + for metric in config["metric_list"]: + if "aggregation" in metric and metric["aggregation"] == "mean": + metrics[metric["metric"]] = [x[metric["metric"]] for x in data] + + system_dict.update(metrics) + system_df = pd.DataFrame(system_dict) + return system_df + + +if __name__ == "__main__": + main()