koichi12 committed
Commit 1d13cae · verified · 1 Parent(s): 84a9380

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. scripts/decode/en-ja/llama2/beam_search.sh +19 -0
  2. scripts/decode/en-ja/llama2/greedy_inference.sh +13 -0
  3. scripts/decode/en-ja/llama2/hf_inference.sh +13 -0
  4. scripts/decode/en-ja/llama2/top_p_inference.sh +17 -0
  5. scripts/decode/en-ja/llama2/top_p_inference_1.sh +20 -0
  6. scripts/decode/en-ja/llama2/top_p_inference_2.sh +21 -0
  7. scripts/decode/en-ja/mistral-ve/top_p_inference.sh +16 -0
  8. scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh +17 -0
  9. scripts/decode/en-ja/mistral/top_p_inference_2.sh +20 -0
  10. scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml +72 -0
  11. scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml +78 -0
  12. scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml +95 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/api/__init__.py +0 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc +0 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc +0 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc +0 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc +0 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc +0 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc +0 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc +0 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc +0 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc +0 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py +56 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/api/group.py +117 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py +38 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py +570 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/api/model.py +385 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py +192 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py +198 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/api/task.py +1674 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py +28 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/__init__.cpython-310.pyc +0 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/anthropic_llms.cpython-310.pyc +0 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/api_models.cpython-310.pyc +0 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/dummy.cpython-310.pyc +0 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/gguf.cpython-310.pyc +0 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/huggingface.cpython-310.pyc +0 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/mamba_lm.cpython-310.pyc +0 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/nemo_lm.cpython-310.pyc +0 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/neuralmagic.cpython-310.pyc +0 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/neuron_optimum.cpython-310.pyc +0 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/openai_completions.cpython-310.pyc +0 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/optimum_lm.cpython-310.pyc +0 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/textsynth.cpython-310.pyc +0 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/utils.cpython-310.pyc +0 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/vllm_causallms.cpython-310.pyc +0 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py +362 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py +641 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py +1356 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py +537 -0
scripts/decode/en-ja/llama2/beam_search.sh ADDED
@@ -0,0 +1,19 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ MAX_INPUT_TOKENS=158
+ BEAM_SIZE=50
+
+ python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+     --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+     -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+     -o /work/translation/wmt2024_test/en-ja/llama2-beam \
+     -g 0 1 2 3 4 5 6 7 \
+     --attn_implementation sdpa \
+     --dynamic_max_new_token_ratio 3.0 \
+     --num_return_sequences ${BEAM_SIZE} \
+     --num_beams ${BEAM_SIZE} \
+     --max_input_tokens ${MAX_INPUT_TOKENS} \
+     -b 158
+
scripts/decode/en-ja/llama2/greedy_inference.sh ADDED
@@ -0,0 +1,13 @@
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ python /code/llm-recipes/tools/hf_inference.py \
+     --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+     -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+     -o /work/translation/wmt24_test/en-ja/mistral-greedy \
+     -g 0 \
+     -b 4096 \
+     --dynamic_max_new_token_ratio 3.0
+
+ echo "Done!"
+
scripts/decode/en-ja/llama2/hf_inference.sh ADDED
@@ -0,0 +1,13 @@
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ python /code/llm-recipes/tools/hf_inference.py \
+     --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+     -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+     -o /work/translation/wmt24_test/en-ja/mistral-greedy \
+     -g 0 \
+     -b 4096 \
+     --dynamic_max_new_token_ratio 3.0
+
+ echo "Done!"
+
scripts/decode/en-ja/llama2/top_p_inference.sh ADDED
@@ -0,0 +1,17 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ i=4
+ GPU_ID=4
+ python /code/llm-recipes/tools/hf_inference.py \
+     --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+     -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+     -o /work/translation/wmt24_test/en-ja/mistral-top-p-0.95/split_0${i} \
+     -g ${GPU_ID} \
+     -b 500 \
+     --attn_implementation sdpa \
+     --dynamic_max_new_token_ratio 3.0 \
+     --num_return_sequences 100 \
+     --do_sample \
+     --top_p 0.95 &
scripts/decode/en-ja/llama2/top_p_inference_1.sh ADDED
@@ -0,0 +1,20 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ for i in `seq 0 6`; do
+     python /code/llm-recipes/tools/hf_inference.py \
+         --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+         -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+         -o /work/translation/wmt24_test/en-ja/llama2-top-p-0.95/split_0${i} \
+         -g ${i} \
+         -b 158 \
+         --attn_implementation sdpa \
+         --dynamic_max_new_token_ratio 3.0 \
+         --num_return_sequences 50 \
+         --do_sample \
+         --top_p 0.95 \
+         --max_input_tokens 158 &
+ done
+ wait
+
scripts/decode/en-ja/llama2/top_p_inference_2.sh ADDED
@@ -0,0 +1,21 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ for i in `seq 7 9`; do
+     GPU_ID=$((i-5))
+     python /code/llm-recipes/tools/hf_inference.py \
+         --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+         -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+         -o /work/translation/wmt24_test/en-ja/llama2-top-p-0.95/split_0${i} \
+         -g ${GPU_ID} \
+         -b 158 \
+         --attn_implementation sdpa \
+         --dynamic_max_new_token_ratio 3.0 \
+         --num_return_sequences 50 \
+         --do_sample \
+         --top_p 0.95 \
+         --max_input_tokens 158 &
+ done
+ wait
+
scripts/decode/en-ja/mistral-ve/top_p_inference.sh ADDED
@@ -0,0 +1,16 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+     --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-averaged-596-600 \
+     -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+     -o /work/translation/wmt2024_test/en-ja/mistral-ve-top-p-0.95 \
+     -g 0 1 2 3 4 5 6 7 \
+     -b 125 \
+     --attn_implementation sdpa \
+     --dynamic_max_new_token_ratio 2.0 \
+     --num_return_sequences 80 \
+     --do_sample \
+     --top_p 0.95 \
+     --max_input_tokens 125
scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh ADDED
@@ -0,0 +1,17 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+     --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-averaged-596-600 \
+     -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+     -o /work/translation/wmt2024_test/en-ja/mistral-ve-top-p-0.95-cpo \
+     -p /work/models/dpo/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-cpo-lora/checkpoint-200 \
+     -g 0 1 2 3 4 5 6 7 \
+     -b 125 \
+     --attn_implementation sdpa \
+     --dynamic_max_new_token_ratio 2.0 \
+     --num_return_sequences 80 \
+     --do_sample \
+     --top_p 0.95 \
+     --max_input_tokens 125 \
scripts/decode/en-ja/mistral/top_p_inference_2.sh ADDED
@@ -0,0 +1,20 @@
+ set -eux
+ LLM_RECIPES_DIR=/code/llm-recipes
+ source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+ for i in `seq 8 9`; do
+     # minus 2 for gpu id
+     GPU_ID=$((i-2))
+     python /code/llm-recipes/tools/hf_inference.py \
+         --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+         -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+         -o /work/translation/wmt24_test/en-ja/mistral-top-p-0.95/split_0${i} \
+         -g ${GPU_ID} \
+         -b 400 \
+         --attn_implementation sdpa \
+         --dynamic_max_new_token_ratio 3.0 \
+         --num_return_sequences 100 \
+         --do_sample \
+         --top_p 0.95 &
+ done
+ wait
scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml ADDED
@@ -0,0 +1,72 @@
+ name: Tasks Modified
+
+ on:
+   push:
+     branches:
+       - 'main'
+   pull_request:
+     branches:
+       - 'main'
+   workflow_dispatch:
+ # comment/edit out the above to stop/change the triggers
+ jobs:
+   changed_files:
+     runs-on: ubuntu-latest  # windows-latest || macos-latest
+     timeout-minutes: 120
+     name: Scan for changed tasks
+     steps:
+       - name: checkout
+         uses: actions/checkout@v3
+         with:
+           fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
+
+       # Uses the tj-actions/changed-files action to check for changes.
+       # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
+       # The `files_yaml` input optionally takes a yaml string to specify filters,
+       # and prepends the filter name to the standard output names.
+       - name: Check task folders
+         id: changed-tasks
+         uses: tj-actions/changed-files@v44.5.2
+         with:
+           # tasks checks the tasks folder and api checks the api folder for changes
+           files_yaml: |
+             tasks:
+               - lm_eval/tasks/**
+             api:
+               - lm_eval/api/**
+           write_output_files: true
+
+       # The next step is optional; the files are written to the workspace by default (above).
+       # so it's just for debugging
+       - name: Run Tests
+         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+         run: |
+           echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
+           echo "One or more test file(s) has changed."
+           echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
+
+       - name: Set up Python 3.9
+         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+         uses: actions/setup-python@v4
+         with:
+           python-version: 3.9
+           cache: 'pip'
+           cache-dependency-path: setup.py
+       - name: Install dependencies
+         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
+         run: |
+           python -m pip install --upgrade pip
+           pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
+           # Install optional git dependencies
+           # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+       - name: Test with pytest
+         # if new tasks are added, run tests on them
+         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
+         run: python -m pytest tests/test_tasks.py -s -vv
+       # if api is modified, run tests on it
+       - name: Test more tasks with pytest
+         env:
+           API: true
+         if: steps.changed-tasks.outputs.api_any_modified == 'true'
+         run: python -m pytest tests/test_tasks.py -s -vv
scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml ADDED
@@ -0,0 +1,78 @@
+ name: Publish Python distribution to PyPI
+
+ on:
+   push:
+     tags:
+       - '*'
+
+ jobs:
+   build:
+     name: Build distribution
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: "3.x"
+
+       - name: Install pypa/build
+         run: >-
+           python3 -m
+           pip install
+           build
+           --user
+       - name: Build a binary wheel and a source tarball
+         run: python3 -m build
+       - name: Store the distribution packages
+         uses: actions/upload-artifact@v3
+         with:
+           name: python-package-distributions
+           path: dist/
+
+   publish-to-pypi:
+     name: >-
+       Publish Python distribution to PyPI
+     if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
+     needs:
+       - build
+     runs-on: ubuntu-latest
+     environment:
+       name: pypi
+       url: https://pypi.org/p/lm_eval
+     permissions:
+       id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+     steps:
+       - name: Download all the dists
+         uses: actions/download-artifact@v3
+         with:
+           name: python-package-distributions
+           path: dist/
+       - name: Publish distribution to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+
+   publish-to-testpypi:
+     name: Publish Python distribution to TestPyPI
+     needs:
+       - build
+     runs-on: ubuntu-latest
+
+     environment:
+       name: testpypi
+       url: https://test.pypi.org/p/lm_eval
+
+     permissions:
+       id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+     steps:
+       - name: Download all the dists
+         uses: actions/download-artifact@v3
+         with:
+           name: python-package-distributions
+           path: dist/
+       - name: Publish distribution to TestPyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           repository-url: https://test.pypi.org/legacy/
scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml ADDED
@@ -0,0 +1,95 @@
+ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+ # just comment out unwanted steps to turn off the test.
+ name: Unit Tests
+
+ on:
+   push:
+     branches:
+       - 'main'
+   pull_request:
+     branches:
+       - 'main'
+   workflow_dispatch:
+ # Jobs run concurrently and steps run sequentially within a job.
+ # jobs: linter and cpu_tests. Add more jobs/steps as required.
+ jobs:
+   linter:
+     name: Linters
+     runs-on: ubuntu-latest
+     timeout-minutes: 5
+
+     steps:
+       - name: Checkout Code
+         uses: actions/checkout@v4
+       - name: Set up Python 3.8
+         uses: actions/setup-python@v5
+         with:
+           python-version: 3.8
+           cache: pip
+           cache-dependency-path: pyproject.toml
+       - name: Pre-Commit
+         env:
+           SKIP: "no-commit-to-branch,mypy"
+
+         uses: pre-commit/action@v3.0.1
+       # # mypy turned off for now
+       # - name: Lint with mypy
+       #   run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
+   # Job 2
+   testcpu:
+     name: CPU Tests
+     runs-on: ubuntu-latest
+     strategy:
+       matrix:
+         python-version: [ "3.8", "3.9", "3.10", "3.11" ]
+     timeout-minutes: 30
+     steps:
+       - name: Checkout Code
+         uses: actions/checkout@v4
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+           cache: pip
+           cache-dependency-path: pyproject.toml
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+           # Install optional git dependencies
+           # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+       - name: Test with pytest
+         run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
+       - name: Archive artifacts
+         uses: actions/upload-artifact@v3
+         with:
+           name: output_results
+           path: |
+             test_logs/*
+   testmodels:
+     name: External LM Tests
+     runs-on: ubuntu-latest
+     timeout-minutes: 30
+     steps:
+       - name: Checkout Code
+         uses: actions/checkout@v4
+       - name: Set up Python 3.8
+         uses: actions/setup-python@v5
+         with:
+           python-version: 3.8
+           cache: pip
+           cache-dependency-path: pyproject.toml
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+       - name: Test with pytest
+         run: python -m pytest tests/models --showlocals -s -vv
+       - name: Archive artifacts
+         uses: actions/upload-artifact@v3
+         with:
+           name: output_results
+           path: |
+             test_logs/*
scripts/yans/lm-evaluation-harness/lm_eval/api/__init__.py ADDED
File without changes
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc ADDED
Binary file (2.72 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc ADDED
Binary file (4.61 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc ADDED
Binary file (1.51 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc ADDED
Binary file (13.2 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc ADDED
Binary file (14.1 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc ADDED
Binary file (5.11 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc ADDED
Binary file (4.81 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc ADDED
Binary file (43.5 kB).
scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py ADDED
@@ -0,0 +1,56 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Callable, Iterable, List, Union
+
+ from lm_eval.api.instance import Instance
+
+
+ class Filter(ABC):
+     """
+     Filter classes operate on a per-task level.
+     They take all model outputs (`instance.resps` for all `task.instances`)
+     across all instances of a task, and perform operations.
+     In a single run, one can configure any number of separate filters or lists of filters.
+
+     """
+
+     def __init__(self, **kwargs) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+         """
+
+     @abstractmethod
+     def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable:
+         """
+         Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
+         Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
+         if pass in [<inst.resps for instance 0>, <inst.resps for instance 1>] should return
+         [<filtered resps for instance 0>, <filtered resps for instance 1>]
+         """
+         return resps
+
+
+ @dataclass
+ class FilterEnsemble:
+     """
+     FilterEnsemble creates a pipeline applying multiple filters.
+     Its intended usage is to stack multiple post-processing steps in order.
+     `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
+     pipeline separately.
+     """
+
+     name: str
+     filters: List[Callable[[], Filter]]
+
+     def apply(self, instances: List[Instance]) -> None:
+         resps, docs = zip(*((inst.resps, inst.doc) for inst in instances))
+         resps, docs = list(resps), list(docs)
+
+         for f in self.filters:
+             # apply filters in sequence
+             resps = f().apply(resps, docs)
+
+         # add the end results after filtering to filtered_requests of their respective source instances.
+         # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
+         for inst, resp in zip(instances, resps):
+             inst.filtered_resps[self.name] = resp
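Illustrative usage sketch (not part of the diff above): one way the filter API shown here can be exercised, assuming the lm_eval package in this repository is importable. TakeFirstFilter and the example instances are hypothetical, not something this commit adds.

from lm_eval.api.filter import Filter, FilterEnsemble
from lm_eval.api.instance import Instance


class TakeFirstFilter(Filter):
    """Hypothetical filter: keep only the first response of each instance."""

    def apply(self, resps, docs):
        return [r[:1] for r in resps]


# two instances, each holding two raw model responses
insts = [
    Instance(request_type="generate_until", doc={}, arguments=("ctx", {}), idx=0),
    Instance(request_type="generate_until", doc={}, arguments=("ctx", {}), idx=1),
]
insts[0].resps = ["A", "B"]
insts[1].resps = ["C", "D"]

# FilterEnsemble stores zero-argument callables that construct Filter objects
ensemble = FilterEnsemble(name="take_first", filters=[TakeFirstFilter])
ensemble.apply(insts)
print(insts[0].filtered_resps["take_first"])  # ['A']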
scripts/yans/lm-evaluation-harness/lm_eval/api/group.py ADDED
@@ -0,0 +1,117 @@
+ import abc
+ from dataclasses import asdict, dataclass
+ from inspect import getsource
+ from typing import Any, Callable, List, Optional, Union
+
+
+ @dataclass
+ class AggMetricConfig(dict):
+     metric: Optional[str] = None
+     aggregation: Optional[str] = "mean"
+     weight_by_size: Optional[str] = False
+     # list of filter names which should be incorporated into the aggregated metric.
+     filter_list: Optional[Union[str, list]] = "none"
+
+     def __post_init__(self):
+         if self.aggregation != "mean":
+             raise ValueError(
+                 f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{self.aggregation}'."
+             )
+
+         if isinstance(self.filter_list, str):
+             self.filter_list = [self.filter_list]
+
+
+ @dataclass
+ class GroupConfig(dict):
+     group: Optional[str] = None
+     group_alias: Optional[str] = None
+     task: Optional[Union[str, list]] = None
+     aggregate_metric_list: Optional[
+         Union[List[AggMetricConfig], AggMetricConfig, dict]
+     ] = None
+     metadata: Optional[dict] = (
+         None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
+     )
+
+     def __getitem__(self, item):
+         return getattr(self, item)
+
+     def __setitem__(self, item, value):
+         return setattr(self, item, value)
+
+     def __post_init__(self):
+         if self.aggregate_metric_list is not None:
+             if isinstance(self.aggregate_metric_list, dict):
+                 self.aggregate_metric_list = [self.aggregate_metric_list]
+
+             self.aggregate_metric_list = [
+                 AggMetricConfig(**item) if isinstance(item, dict) else item
+                 for item in self.aggregate_metric_list
+             ]
+
+     def to_dict(self, keep_callable: bool = False) -> dict:
+         """dumps the current config as a dictionary object, as a printable format.
+         null fields will not be printed.
+         Used for dumping results alongside full task configuration
+
+         :return: dict
+             A printable dictionary version of the TaskConfig object.
+
+         # TODO: should any default value in the TaskConfig not be printed?
+         """
+         cfg_dict = asdict(self)
+         # remove values that are `None`
+         for k, v in list(cfg_dict.items()):
+             if callable(v):
+                 cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
+         return cfg_dict
+
+     def serialize_function(
+         self, value: Union[Callable, str], keep_callable=False
+     ) -> Union[Callable, str]:
+         """Serializes a given function or string.
+
+         If 'keep_callable' is True, the original callable is returned.
+         Otherwise, attempts to return the source code of the callable using 'getsource'.
+         """
+         if keep_callable:
+             return value
+         else:
+             try:
+                 return getsource(value)
+             except (TypeError, OSError):
+                 return str(value)
+
+
+ class ConfigurableGroup(abc.ABC):
+     def __init__(
+         self,
+         config: Optional[dict] = None,
+     ) -> None:
+         self._config = GroupConfig(**config)
+
+     @property
+     def group(self):
+         return self._config.group
+
+     @property
+     def group_alias(self):
+         return self._config.group_alias
+
+     @property
+     def version(self):
+         return self._config.version
+
+     @property
+     def config(self):
+         return self._config.to_dict()
+
+     @property
+     def group_name(self) -> Any:
+         return self._config.group
+
+     def __repr__(self):
+         return (
+             f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})"
+         )
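Illustrative sketch (not part of the diff above): how GroupConfig normalizes aggregate_metric_list in __post_init__, assuming the module shown here is importable. The group and task names are made-up values for the example.

from lm_eval.api.group import ConfigurableGroup, GroupConfig

# a single aggregation spec given as a plain dict is wrapped into [AggMetricConfig(...)]
cfg = GroupConfig(
    group="example_group",
    task=["subtask_a", "subtask_b"],
    aggregate_metric_list={"metric": "acc", "weight_by_size": True},
)
print(cfg.aggregate_metric_list[0].aggregation)  # 'mean' (the only supported value)
print(cfg["group"])  # dict-style access is routed to the dataclass attributes

group = ConfigurableGroup(config={"group": "example_group"})
print(group.group_name)  # 'example_group'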
scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py ADDED
@@ -0,0 +1,38 @@
+ from dataclasses import dataclass, field
+ from typing import Literal, Optional, Tuple
+
+
+ OutputType = Literal[
+     "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice"
+ ]
+
+
+ @dataclass
+ class Instance:
+     request_type: OutputType
+     doc: dict
+     arguments: tuple
+     idx: int
+     metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
+         default_factory=lambda: (None, None, None)
+     )
+     resps: list = field(default_factory=list)
+     filtered_resps: dict = field(default_factory=dict)
+
+     # initialized after init
+     task_name: Optional[str] = None
+     doc_id: Optional[int] = None
+     repeats: Optional[int] = None
+
+     def __post_init__(self) -> None:
+         # unpack metadata field
+         self.task_name, self.doc_id, self.repeats = self.metadata
+
+     @property
+     def args(self):
+         """
+         Returns (string,) where `string` is the string to calculate loglikelihood over
+         """
+         return (
+             self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+         )
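Illustrative sketch (not part of the diff above): constructing an Instance and reading the fields that __post_init__ unpacks from metadata, assuming the module shown here is importable. The field values are made up.

from lm_eval.api.instance import Instance

inst = Instance(
    request_type="loglikelihood",
    doc={"question": "2+2="},
    arguments=("2+2=", " 4"),
    idx=0,
    metadata=("example_task", 7, 1),
)
print(inst.args)                                  # ('2+2=', ' 4')
print(inst.task_name, inst.doc_id, inst.repeats)  # example_task 7 1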
scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py ADDED
@@ -0,0 +1,570 @@
+ import logging
+ import math
+ import random
+ import re
+ import string
+ from collections.abc import Iterable
+ from typing import List
+
+ import numpy as np
+ import sacrebleu
+
+ from lm_eval.api.registry import register_aggregation, register_metric
+
+
+ eval_logger = logging.getLogger("lm-eval")
+
+
+ # Register Aggregations First
+ @register_aggregation("bypass")
+ def bypass_agg(arr):
+     return 999
+
+
+ @register_aggregation("mean")
+ def mean(arr):
+     return sum(arr) / len(arr)
+
+
+ @register_aggregation("median")
+ def median(arr):
+     return arr[len(arr) // 2]
+
+
+ # Certain metrics must be calculated across all documents in a benchmark.
+ # We use them as aggregation metrics, paired with no-op passthrough metric fns.
+ @register_aggregation("perplexity")
+ def perplexity(items):
+     return math.exp(-mean(items))
+
+
+ @register_aggregation("weighted_perplexity")
+ def weighted_perplexity(items):
+     return math.exp(-weighted_mean(items))
+
+
+ @register_aggregation("bits_per_byte")
+ def bits_per_byte(items):
+     return -weighted_mean(items) / math.log(2)
+
+
+ @register_aggregation("f1")
+ def f1_score(items):
+     from sklearn.metrics import f1_score
+
+     unzipped_list = list(zip(*items))
+     golds = unzipped_list[0]
+     preds = unzipped_list[1]
+     fscore = f1_score(golds, preds)
+
+     return np.max(fscore)
+
+
+ @register_aggregation("matthews_corrcoef")
+ def matthews_corrcoef(items):
+     from sklearn.metrics import matthews_corrcoef
+
+     unzipped_list = list(zip(*items))
+     golds = unzipped_list[0]
+     preds = unzipped_list[1]
+     return matthews_corrcoef(golds, preds)
+
+
+ @register_aggregation("bleu")
+ def bleu(items):
+     """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
+     for evaluating a generated sentence to a reference sentence. It counts matching
+     n-grams in the candidate translation to n-grams in the reference text, where
+     1-gram or unigram would be each token and a bigram comparison would be each
+     word pair. The comparison is made regardless of word order
+     Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
+     Paper: https://www.aclweb.org/anthology/P02-1040/
+
+     Higher is better
+     """
+     refs = list(zip(*items))[0]
+     preds = list(zip(*items))[1]
+     refs, preds = _sacreformat(refs, preds)
+     return sacrebleu.corpus_bleu(preds, refs).score
+
+
+ @register_aggregation("chrf")
+ def chrf(items):
+     """chrF++ is a tool for automatic evaluation of machine translation output
+     based on character n-gram precision and recall enhanced with word n-grams.
+     Source: https://github.com/m-popovic/chrF
+     Paper: https://www.aclweb.org/anthology/W15-3049.pdf
+
+     Higher is better  # TODO I think
+     """
+     refs = list(zip(*items))[0]
+     preds = list(zip(*items))[1]
+     refs, preds = _sacreformat(refs, preds)
+     return sacrebleu.corpus_chrf(preds, refs).score
+
+
+ @register_aggregation("ter")
+ def ter(items):
+     """Translation Error Rate is an error metric for machine translation that
+     measures the number of edits required to change a system output into one
+     of the references
+     Source: http://www.cs.umd.edu/~snover/tercom/
+     Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
+
+     Lower is better
+     """
+     refs = list(zip(*items))[0]
+     preds = list(zip(*items))[1]
+     refs, preds = _sacreformat(refs, preds)
+     return sacrebleu.corpus_ter(preds, refs).score
+
+
+ @register_aggregation("brier_score")
+ def brier_score(items):  # This is a passthrough function
+     gold, predictions = list(zip(*items))
+     bs, num_class = np.array(predictions).shape
+
+     gold = list(gold)
+     gold_one_hot = np.eye(num_class)[gold]
+     return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1))
+
+
+ @register_metric(
+     metric="brier_score",
+     higher_is_better=False,
+     output_type=["multiple_choice"],
+     aggregation="brier_score",
+ )
+ def brier_score_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="acc",
+     higher_is_better=True,
+     output_type=["loglikelihood", "multiple_choice"],
+     aggregation="mean",
+ )
+ def acc_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="acc_norm",
+     higher_is_better=True,
+     output_type=["loglikelihood", "multiple_choice"],
+     aggregation="mean",
+ )
+ def acc_norm_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="acc_mutual_info",
+     higher_is_better=True,
+     output_type="multiple_choice",
+     aggregation="mean",
+ )
+ def acc_mutual_info_fn(items):  # This is a passthrough function
+     return items
+
+
+ ### the code used in the `exact_match_hf_evaluate` function is ported from
+ ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
+ ### which is under the apache license.
+
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ def exact_match_hf_evaluate(
+     predictions,
+     references,
+     regexes_to_ignore=None,
+     ignore_case=False,
+     ignore_punctuation=False,
+     ignore_numbers=False,
+ ):
+     if regexes_to_ignore is not None:
+         for s in regexes_to_ignore:
+             predictions = np.array([re.sub(s, "", x) for x in predictions])
+             references = np.array([re.sub(s, "", x) for x in references])
+     else:
+         predictions = np.asarray(predictions)
+         references = np.asarray(references)
+
+     if ignore_case:
+         predictions = np.char.lower(predictions)
+         references = np.char.lower(references)
+
+     if ignore_punctuation:
+         repl_table = string.punctuation.maketrans("", "", string.punctuation)
+         predictions = np.char.translate(predictions, table=repl_table)
+         references = np.char.translate(references, table=repl_table)
+
+     if ignore_numbers:
+         repl_table = string.digits.maketrans("", "", string.digits)
+         predictions = np.char.translate(predictions, table=repl_table)
+         references = np.char.translate(references, table=repl_table)
+
+     score_list = predictions == references
+
+     return {"exact_match": np.mean(score_list)}
+
+
+ ###
+
+
+ @register_metric(
+     metric="exact_match",
+     higher_is_better=True,
+     output_type="generate_until",
+     aggregation="mean",
+ )
+ def exact_match_fn(**kwargs):
+     return exact_match_hf_evaluate(**kwargs)
+
+
+ @register_metric(
+     metric="perplexity",
+     higher_is_better=False,
+     output_type="loglikelihood",
+     aggregation="perplexity",
+ )
+ def perplexity_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="word_perplexity",
+     higher_is_better=False,
+     output_type="loglikelihood_rolling",
+     aggregation="weighted_perplexity",
+ )
+ def word_perplexity_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="byte_perplexity",
+     higher_is_better=False,
+     output_type="loglikelihood_rolling",
+     aggregation="weighted_perplexity",
+ )
+ def byte_perplexity_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="bits_per_byte",
+     higher_is_better=False,
+     output_type="loglikelihood_rolling",
+     aggregation="bits_per_byte",
+ )
+ def bits_per_byte_fn(items):  # This is a passthrough function
+     return items
+
+
+ def pop_stddev(arr):
+     mu = mean(arr)
+     return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
+
+
+ def sample_stddev(arr):
+     mu = mean(arr)
+     return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+
+
+ def mean_stderr(arr):
+     return sample_stddev(arr) / math.sqrt(len(arr))
+
+
+ @register_metric(
+     metric="bypass",
+     higher_is_better=True,
+     output_type=["loglikelihood", "multiple_choice", "generate_until"],
+     aggregation="bypass",
+ )
+ def bypass(items):
+     return None
+
+
+ @register_metric(
+     metric="mcc",
+     higher_is_better=True,
+     output_type="multiple_choice",
+     aggregation="matthews_corrcoef",
+ )
+ def mcc_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="f1",
+     higher_is_better=True,
+     output_type="multiple_choice",
+     aggregation="f1",
+ )
+ def f1_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="bleu",
+     higher_is_better=True,
+     output_type="generate_until",
+     aggregation="bleu",
+ )
+ def bleu_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="chrf",
+     higher_is_better=True,
+     output_type="generate_until",
+     aggregation="chrf",
+ )
+ def chrf_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="ter",
+     higher_is_better=True,
+     output_type="generate_until",
+     aggregation="ter",
+ )
+ def ter_fn(items):  # This is a passthrough function
+     return items
+
+
+ @register_metric(
+     metric="acc_all",
+     higher_is_better=True,
+     output_type="loglikelihood",
+     aggregation="mean",
+ )
+ def acc_all(items):
+     # Only count as correct if all answers are labeled correctly for each question
+     question_scoring_dict = {}
+     preds = list(zip(*items))[0]
+     docs = list(zip(*items))[1]
+
+     for doc, pred in zip(docs, preds):
+         paragraph_id = doc["idx"]["paragraph"]
+         question_id = doc["idx"]["question"]
+         if (paragraph_id, question_id) not in question_scoring_dict:
+             question_scoring_dict[(paragraph_id, question_id)] = []
+
+         gold_label = doc["label"] == 1
+
+         question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
+     acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+     return acc
+
+
+ def acc_all_stderr(items):
+     # Only count as correct if all answers are labeled correctly for each question
+     question_scoring_dict = {}
+     preds = list(zip(*items))[0]
+     docs = list(zip(*items))[1]
+
+     for doc, pred in zip(docs, preds):
+         question_id = doc["idx"]["question"]
+         if question_id not in question_scoring_dict:
+             question_scoring_dict[question_id] = []
+
+         gold_label = doc["label"] == 1
+         question_scoring_dict[question_id].append(gold_label == pred)
+
+     acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
+     return acc
+
+
+ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+     """Compute max metric between prediction and each ground truth."""
+     scores_for_ground_truths = []
+     for ground_truth in ground_truths:
+         score = metric_fn(prediction, ground_truth)
+         scores_for_ground_truths.append(score)
+     return max(scores_for_ground_truths)
+
+
+ def weighted_mean(items):
+     a, b = zip(*items)
+     return sum(a) / sum(b)
+
+
+ def is_non_str_iterable(obj):
+     return isinstance(obj, Iterable) and not isinstance(obj, str)
+
+
+ def _sacreformat(refs, preds):
+     """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
+     # Sacrebleu expects (List[str], List[List[str])
+     # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
+
+     # Note [ref1_stream] is the first reference for each pred.
+     # So lists are size N and (M, N) for N preds and M possible refs for each pred
+     # This is a different order of dimensions that I would expect
+
+     # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
+     # Must become List[List[str]] with the inner list corresponding to preds
+     if not is_non_str_iterable(refs):
+         refs = list(refs)
+     if not is_non_str_iterable(refs[0]):
+         refs = [[ref] for ref in refs]
+     refs = list(zip(*refs))
+     # Note the number of refs in each ref list much match the number of preds
+
+     # We expect preds to be List[str] or List[List[str]]. Must become List[str]
+     if not is_non_str_iterable(preds):
+         preds = list(preds)
+     if is_non_str_iterable(preds[0]):
+         assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
+         preds = [pred[0] for pred in preds]
+
+     return refs, preds
+
+
+ # stderr stuff
+
+
+ class _bootstrap_internal:
+     def __init__(self, f, n) -> None:
+         self.f = f
+         self.n = n
+
+     def __call__(self, v):
+         i, xs = v
+         rnd = random.Random()
+         rnd.seed(i)
+         res = []
+         for _ in range(self.n):
+             res.append(self.f(rnd.choices(xs, k=len(xs))))
+         return res
+
+
+ def bootstrap_stderr(f, xs, iters):
+     import multiprocessing as mp
+
+     pool = mp.Pool(mp.cpu_count())
+     # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
+     # equivalent to stderr calculated without Bessel's correction in the stddev.
+     # Unfortunately, I haven't been able to figure out what the right correction is
+     # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
+     # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
+     # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+     res = []
+     chunk_size = min(1000, iters)
+     from tqdm import tqdm
+
+     print("bootstrapping for stddev:", f.__name__)
+     for bootstrap in tqdm(
+         pool.imap(
+             _bootstrap_internal(f, chunk_size),
+             [(i, xs) for i in range(iters // chunk_size)],
+         ),
+         total=iters // chunk_size,
+     ):
+         # sample w replacement
+         res.extend(bootstrap)
+
+     pool.close()
+     return sample_stddev(res)
+
+
+ def stderr_for_metric(metric, bootstrap_iters: int):
+     if bootstrap_iters <= 0:
+         # return no function (don't compute stderr) if bootstrap iters = 0
+         return None
+
+     bootstrappable = [
+         median,
+         matthews_corrcoef,
+         f1_score,
+         perplexity,
+         bleu,
+         chrf,
+         ter,
+     ]
+
+     if metric in bootstrappable:
+         return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
+
+     stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
+
+     return stderr.get(metric, None)
+
+
+ def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
+     # Used to aggregate bootstrapped stderrs across subtasks in a group,
+     # when we are weighting by the size of each subtask.
+     #
+
+     assert len(stderrs) == len(sizes)
+
+     # formula source: https://en.wikipedia.org/wiki/Pooled_variance
+     # and: https://stats.stackexchange.com/a/4841331
+     # this empirically seems to match running `stderr_for_metric` on all instances
+     # from the subtasks concatenated with each other.
+     pooled_sample_var = (
+         sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)])
+     ) / (sum(sizes) - len(sizes))
+
+     return np.sqrt(pooled_sample_var / sum(sizes))
+
+
+ def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
+     assert (
+         metrics is not None
+     ), "Need to pass a list of each subtask's metric for this stderr aggregation"
+     assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
+
+     # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
+     # This formula depends on sample means.
+     # removed because it seems to give erroneously huge stderrs for groupings of tasks
+     # and does not seem to match up with bootstrap-calculated stderrs for groups.
+
+     ### don't use this unless a statistician has told you it's the right thing to do ###
+
+     # accumulators: we'll aggregate pairwise N - 1 times
+     variance = stderrs[0] ** 2
+     curr_size = sizes[0]
+     curr_score = metrics[0]
+
+     for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
+         curr_score = ((curr_score * curr_size) + (score * size)) / (
+             curr_size + size
+         )  # NOTE: this assumes our aggregation fn is "mean"
+
+         variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
+             curr_size + size - 1
+         ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
+             curr_score - score
+         ) ** 2
+
+     return np.sqrt(variance)
+
+
+ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
+     # A helper function that is used to aggregate
+     # subtask scores cross-task.
+     # TODO: does not hold for non-mean aggregations
+     if not weight_by_size:
+         sizes = [1] * len(sizes)
+
+     assert len(metrics) == len(sizes)
+
+     return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
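Illustrative sketch (not part of the diff above): the group-aggregation helpers at the end of metrics.py, assuming the module shown here is importable. The scores, sizes, and stderrs are made-up numbers.

from lm_eval.api.metrics import aggregate_subtask_metrics, pooled_sample_stderr

scores = [0.50, 0.80]   # per-subtask metric values
sizes = [100, 300]      # number of documents in each subtask

# size-weighted mean: (0.5*100 + 0.8*300) / 400 = 0.725
print(aggregate_subtask_metrics(scores, sizes))
# unweighted mean: (0.5 + 0.8) / 2 = 0.65
print(aggregate_subtask_metrics(scores, sizes, weight_by_size=False))
# pooled stderr for the same grouping, from per-subtask stderrs
print(pooled_sample_stderr([0.05, 0.02], sizes))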
scripts/yans/lm-evaluation-harness/lm_eval/api/model.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import hashlib
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import Dict, List, Optional, Tuple, Type, TypeVar
7
+
8
+ import transformers
9
+ from sqlitedict import SqliteDict
10
+ from tqdm import tqdm
11
+
12
+ from lm_eval import utils
13
+
14
+
15
+ eval_logger = logging.getLogger("lm-eval")
16
+
17
+ T = TypeVar("T", bound="LM")
18
+
19
+
20
+ class LM(abc.ABC):
21
+ def __init__(self) -> None:
22
+ """Defines the interface that should be implemented by all LM subclasses.
23
+ LMs are assumed to take text (strings) as input and yield strings as output
24
+ (inputs/outputs should be tokenization-agnostic.)
25
+
26
+ """
27
+ # set rank and world size to a single process, by default.
28
+ self._rank = 0
29
+ self._world_size = 1
30
+ self.cache_hook = CacheHook(None)
31
+
32
+ @abc.abstractmethod
33
+ def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
34
+ """Compute log-likelihood of generating a continuation from a context.
35
+ Downstream tasks should attempt to use loglikelihood instead of other
36
+ LM calls whenever possible.
37
+
38
+ :param requests: list[Instance]
39
+ A list of Instance objects, with property `args` which returns a tuple (context, continuation).
40
+ `context: str`
41
+ Context string. Implementations of LM must be able to handle an
42
+ empty context string.
43
+ `continuation: str`
44
+ The continuation over which log likelihood will be calculated. If
45
+ there is a word boundary, the space should be in the continuation.
46
+ For example, context="hello" continuation=" world" is correct.
47
+
48
+ :return: list[tuple[float, bool]]
49
+ A list of pairs (logprob, isgreedy)
50
+ `logprob: float`
51
+ The log probability of `continuation`.
52
+ `isgreedy`:
53
+ Whether `continuation` would be generated by greedy sampling from `context`.
54
+ """
55
+ pass
56
+
57
+ @abc.abstractmethod
58
+ def loglikelihood_rolling(self, requests) -> List[float]:
59
+ """Compute full log-likelihood of a string, with no truncation, for perplexity computation
60
+ - We will use the full max context length of the model.
61
+ - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
62
+ the max context length.
63
+ - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
64
+ which may simply concatenate multiple documents together.
65
+ - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
66
+ multiple chunks, the last input will still a full-sized context.
67
+ Example:
68
+ Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
69
+ Prefix: BOS/EOS
70
+ Max context length: 4
71
+ Resulting input/prediction pairs:
72
+
73
+ INPUT: BOS 0 1 2
74
+ PRED: 0 1 2 3
75
+
76
+ INPUT: 3 4 5 6
77
+ PRED: 4 5 6 7
78
+
79
+ INPUT: 5 6 7 8
80
+ PRED: 8 9
81
+
82
+ Observe that:
83
+ 1. Each token is predicted exactly once
84
+ 2. For the last pair, we provide the full context, but only score the last two tokens
85
+
86
+ :param requests: list[Instance]
87
+ A list of Instance objects with property `args` which returns a tuple (context,).
88
+ string: str
89
+ String for which we are computing overall loglikelihood
90
+ :return: list[tuple[float]]
91
+ A list of tuples (logprob,)
92
+ logprob: float
93
+ The log probability of `context` conditioned on the BOS/EOS token.
94
+ Can also be overridden for custom cases by `prefix_token_id`.
95
+ """
96
+ pass
97
+
98
+ # TODO: Add an optional max length
99
+ @abc.abstractmethod
100
+ def generate_until(self, requests) -> List[str]:
101
+ """Generate greedily until a stopping sequence
102
+
103
+ :param requests: list[Instance]
104
+ A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs).
105
+ context: str
106
+ Context string
107
+ gen_kwargs: dict
108
+ A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc.
109
+ :return: list[str]
110
+ A list of model generated continuations.
111
+ continuation: str
112
+ The generated continuation.
113
+ """
114
+ pass
115
+
116
+ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
117
+ """
118
+ Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
119
+
120
+ :param chat_history: list[dict[str, str]]
121
+ A list of dictionaries with keys 'role' and 'content'.
122
+ Values are strings representing the role name and the content of the message, respectively.
123
+ :return: str
124
+ A string representing the chat history in a format that can be used as input to the LM.
125
+ """
126
+ raise NotImplementedError(
127
+ "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type."
128
+ )
129
+
130
+ @classmethod
131
+ def create_from_arg_string(
132
+ cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
133
+ ) -> T:
134
+ """
135
+ Creates an instance of the LM class using the given argument string and additional config.
136
+
137
+ Parameters:
138
+ - arg_string: A string containing arguments in the format key1=value1,key2=value2.
139
+ - additional_config: Optional dictionary containing additional configuration parameters.
140
+
141
+ Returns:
142
+ - Instance of the LM class.
143
+ """
144
+ additional_config = {} if additional_config is None else additional_config
145
+ args = utils.simple_parse_args_string(arg_string)
146
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
147
+ return cls(**args, **args2)
148
+
149
+ @classmethod
150
+ def create_from_arg_obj(
151
+ cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
152
+ ) -> T:
153
+ """
154
+ Creates an instance of the LM class using the given arg_obj
155
+
156
+ Parameters:
157
+ - arg_obj: A dict containing arguments in the format key1=value1,key2=value2.
158
+ - additional_config: Optional dictionary containing additional configuration parameters.
159
+
160
+ Returns:
161
+ - Instance of the LM class.
162
+ """
163
+
164
+ additional_config = {} if additional_config is None else additional_config
165
+ additional_config = {
166
+ k: v for k, v in additional_config.items() if v is not None
167
+ }
168
+
169
+ return cls(**arg_dict, **additional_config)
170
+
171
+ @property
172
+ def rank(self):
173
+ # used in the case of parallelism. Hardcoded to
174
+ # ensure no errors arise using API models which do
175
+ # not support multi-device parallelism nor expect it.
176
+ return self._rank
177
+
178
+ @property
179
+ def world_size(self):
180
+ # used in the case of parallelism. Hardcoded to
181
+ # ensure no errors arise using API models which do
182
+ # not support multi-device parallelism nor expect it.
183
+ return self._world_size
184
+
185
+ @property
186
+ def tokenizer_name(self) -> str:
187
+ """Must be defined for LM subclasses which implement Chat Templating.
188
+ Should return the name of the tokenizer or chat template used.
189
+ Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
190
+ """
191
+ raise NotImplementedError(
192
+ "To use this model with chat templates, please implement the 'tokenizer_name' property."
193
+ )
194
+
195
+ @property
196
+ def chat_template(self) -> str:
197
+ """Must be defined for LM subclasses that implement Chat Templating.
198
+ Should return the structure of the chat template applied to user/assistant messages.
199
+ This is used only to save in the experiment results for reproducibility.
200
+ """
201
+ raise NotImplementedError(
202
+ "To use this model with chat templates, please implement the 'chat_template' property."
203
+ )
204
+
205
+ def set_cache_hook(self, cache_hook) -> None:
206
+ self.cache_hook = cache_hook
207
+
208
+
209
+ ### SQLite-based caching of LM responses
210
+ def hash_args(attr, args):
211
+ dat = json.dumps([attr] + list(args))
212
+ return hashlib.sha256(dat.encode("utf-8")).hexdigest()
213
+
214
+
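# Minimal sketch of the fingerprinting above: the request method name plus its
# JSON-serialised arguments determine the SQLite cache key.
import hashlib
import json

def demo_hash(attr, args):
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode("utf-8")).hexdigest()

key = demo_hash("generate_until", ["Translate: hello", {"until": ["\n\n"]}])
print(key[:16])  # identical (attr, args) pairs always map to the same key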
215
+ class CacheHook:
216
+ def __init__(self, cachinglm) -> None:
217
+ if cachinglm is None:
218
+ self.dbdict = None
219
+ return
220
+
221
+ self.dbdict = cachinglm.dbdict
222
+
223
+ def add_partial(self, attr, req, res) -> None:
224
+ if self.dbdict is None:
225
+ return
226
+ hsh = hash_args(attr, req)
227
+ self.dbdict[hsh] = res
228
+
229
+
230
+ class CachingLM:
231
+ def __init__(self, lm, cache_db) -> None:
232
+ """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
233
+
234
+ :param lm: LM
235
+ Underlying LM
236
+ :param cache_db: str
237
+ Path to cache db
238
+ """
239
+ self.lm = lm
240
+ self.cache_db = cache_db
241
+ if os.path.dirname(cache_db):
242
+ os.makedirs(os.path.dirname(cache_db), exist_ok=True)
243
+ self.dbdict = SqliteDict(cache_db, autocommit=True)
244
+
245
+ # add hook to lm
246
+ lm.set_cache_hook(self.get_cache_hook())
247
+
248
+ def __getattr__(self, attr: str):
249
+ lm_attr = getattr(self.lm, attr)
250
+ if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
251
+ eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
252
+ return lm_attr
253
+
254
+ def fn(requests):
255
+ res = []
256
+ remaining_reqs = []
257
+ warned = False
258
+ # figure out which ones are cached and which ones are new
259
+ eval_logger.info(
260
+ f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..."
261
+ )
262
+ for req in tqdm(requests, desc="Checking cached requests"):
263
+ hsh = hash_args(attr, req.args)
264
+ if attr == "generate_until" and req.args[1].get("do_sample", False):
265
+ # when we are doing non-greedy generation, don't use the cache
266
+ # (else every "randomly sampled" generation would be identical for repeats > 1).
267
+ if not warned:
268
+ eval_logger.warning(
269
+ f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
270
+ )
271
+ warned = True
272
+ res.append(None)
273
+ remaining_reqs.append(req)
274
+ elif hsh in self.dbdict:
275
+ ob = self.dbdict[hsh]
276
+
277
+ assert ob is not None
278
+
279
+ res.append(ob)
280
+ else:
281
+ res.append(None)
282
+ remaining_reqs.append(req)
283
+ eval_logger.info(
284
+ f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
285
+ )
286
+ # actually run the LM on the requests that do not have cached results
287
+ rem_res = getattr(self.lm, attr)(remaining_reqs)
288
+
289
+ # stick the new ones back into the list and also cache any of the new ones
290
+ resptr = 0
291
+ for req, r in zip(remaining_reqs, rem_res):
292
+ while res[resptr] is not None:
293
+ resptr += 1
294
+
295
+ res[resptr] = r
296
+
297
+ # caching
298
+ hsh = hash_args(attr, req.args)
299
+ self.dbdict[hsh] = r
300
+ self.dbdict.commit()
301
+
302
+ return res
303
+
304
+ return fn
305
+
306
+ def get_cache_hook(self):
307
+ return CacheHook(self)
308
+
309
+
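# Usage sketch (hedged): `base_lm` stands in for any concrete LM instance.
#
#   from lm_eval.api.model import CachingLM
#
#   lm = CachingLM(base_lm, "eval_cache/my_model.db")
#   results = lm.loglikelihood(requests)
#
# Cached loglikelihood/generate_until responses are read from the SQLite file;
# the rest fall through to `base_lm` and are written back. Sampled generations
# (do_sample=True) deliberately bypass the cache, as noted in `fn` above.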
310
+ class TemplateLM(LM):
311
+ """
312
+ A class acting as intermediary between the LM base class
313
+ and boilerplate often included in other LM subclasses.
314
+ """
315
+
316
+ @property
317
+ @abc.abstractmethod
318
+ def eot_token_id(self):
319
+ pass
320
+
321
+ @property
322
+ def prefix_token_id(self):
323
+ # it is used as prefix for loglikelihood
324
+ return self.eot_token_id
325
+
326
+ @abc.abstractmethod
327
+ def tok_encode(self, string: str, **kwargs) -> List[int]:
328
+ """
329
+ Tokenize a string using the model's tokenizer and return a list of token IDs.
330
+ """
331
+ pass
332
+
333
+ @abc.abstractmethod
334
+ def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
335
+ pass
336
+
337
+ def _encode_pair(
338
+ self, context: str, continuation: str
339
+ ) -> Tuple[List[int], List[int]]:
340
+ n_spaces = len(context) - len(context.rstrip())
341
+ if n_spaces > 0:
342
+ continuation = context[-n_spaces:] + continuation
343
+ context = context[:-n_spaces]
344
+
345
+ model_class = getattr(self, "AUTO_MODEL_CLASS", None)
346
+
347
+ if model_class == transformers.AutoModelForSeq2SeqLM:
348
+ context_enc = self.tok_encode(context)
349
+ continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
350
+ else:
351
+ whole_enc = self.tok_encode(context + continuation)
352
+ context_enc = self.tok_encode(context)
353
+
354
+ context_enc_len = len(context_enc)
355
+ continuation_enc = whole_enc[context_enc_len:]
356
+
357
+ return context_enc, continuation_enc
358
+
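# Worked example of the whitespace shift above, independent of any tokenizer:
# trailing spaces move from the context onto the continuation so the
# continuation keeps its leading-space token form.
context, continuation = "Question: 2+2= ", "4"
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
    continuation = context[-n_spaces:] + continuation
    context = context[:-n_spaces]
assert (context, continuation) == ("Question: 2+2=", " 4")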
359
+ def loglikelihood(
360
+ self, requests, disable_tqdm: bool = False
361
+ ) -> List[Tuple[float, bool]]:
362
+ new_reqs = []
363
+ for context, continuation in [req.args for req in requests]:
364
+ if context == "":
365
+ # BOS or EOS as context
366
+ context_enc, continuation_enc = (
367
+ [self.prefix_token_id],
368
+ self.tok_encode(continuation),
369
+ )
370
+ else:
371
+ context_enc, continuation_enc = self._encode_pair(context, continuation)
372
+
373
+ new_reqs.append(((context, continuation), context_enc, continuation_enc))
374
+
375
+ return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
376
+
377
+ @abc.abstractmethod
378
+ def loglikelihood_rolling(
379
+ self, requests, disable_tqdm: bool = False
380
+ ) -> List[float]:
381
+ pass
382
+
383
+ @abc.abstractmethod
384
+ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
385
+ pass
scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py ADDED
@@ -0,0 +1,192 @@
1
+ import logging
2
+ from typing import Callable, Dict
3
+
4
+ import evaluate as hf_evaluate
5
+
6
+ from lm_eval.api.model import LM
7
+
8
+
9
+ eval_logger = logging.getLogger("lm-eval")
10
+
11
+ MODEL_REGISTRY = {}
12
+
13
+
14
+ def register_model(*names):
15
+ # either pass a list or a single alias.
16
+ # function receives them as a tuple of strings
17
+
18
+ def decorate(cls):
19
+ for name in names:
20
+ assert issubclass(
21
+ cls, LM
22
+ ), f"Model '{name}' ({cls.__name__}) must extend LM class"
23
+
24
+ assert (
25
+ name not in MODEL_REGISTRY
26
+ ), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
27
+
28
+ MODEL_REGISTRY[name] = cls
29
+ return cls
30
+
31
+ return decorate
32
+
33
+
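# Registration sketch: the alias "my-echo" and class EchoLM are invented here
# purely for illustration.
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model, get_model

@register_model("my-echo")
class EchoLM(LM):
    """Placeholder subclass; a real backend implements the LM abstract methods."""

assert get_model("my-echo") is EchoLM
# Re-using an existing alias, or decorating a class that does not subclass LM,
# trips the asserts inside `decorate`.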
34
+ def get_model(model_name):
35
+ try:
36
+ return MODEL_REGISTRY[model_name]
37
+ except KeyError:
38
+ raise ValueError(
39
+ f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}"
40
+ )
41
+
42
+
43
+ TASK_REGISTRY = {}
44
+ GROUP_REGISTRY = {}
45
+ ALL_TASKS = set()
46
+ func2task_index = {}
47
+
48
+
49
+ def register_task(name):
50
+ def decorate(fn):
51
+ assert (
52
+ name not in TASK_REGISTRY
53
+ ), f"task named '{name}' conflicts with existing registered task!"
54
+
55
+ TASK_REGISTRY[name] = fn
56
+ ALL_TASKS.add(name)
57
+ func2task_index[fn.__name__] = name
58
+ return fn
59
+
60
+ return decorate
61
+
62
+
63
+ def register_group(name):
64
+ def decorate(fn):
65
+ func_name = func2task_index[fn.__name__]
66
+ if name in GROUP_REGISTRY:
67
+ GROUP_REGISTRY[name].append(func_name)
68
+ else:
69
+ GROUP_REGISTRY[name] = [func_name]
70
+ ALL_TASKS.add(name)
71
+ return fn
72
+
73
+ return decorate
74
+
75
+
76
+ OUTPUT_TYPE_REGISTRY = {}
77
+ METRIC_REGISTRY = {}
78
+ METRIC_AGGREGATION_REGISTRY = {}
79
+ AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
80
+ HIGHER_IS_BETTER_REGISTRY = {}
81
+ FILTER_REGISTRY = {}
82
+
83
+ DEFAULT_METRIC_REGISTRY = {
84
+ "loglikelihood": [
85
+ "perplexity",
86
+ "acc",
87
+ ],
88
+ "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
89
+ "multiple_choice": ["acc", "acc_norm"],
90
+ "generate_until": ["exact_match"],
91
+ }
92
+
93
+
94
+ def register_metric(**args):
95
+ # TODO: do we want to enforce a certain interface to registered metrics?
96
+ def decorate(fn):
97
+ assert "metric" in args
98
+ name = args["metric"]
99
+
100
+ for key, registry in [
101
+ ("metric", METRIC_REGISTRY),
102
+ ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
103
+ ("aggregation", METRIC_AGGREGATION_REGISTRY),
104
+ ]:
105
+ if key in args:
106
+ value = args[key]
107
+ assert (
108
+ value not in registry
109
+ ), f"{key} named '{value}' conflicts with existing registered {key}!"
110
+
111
+ if key == "metric":
112
+ registry[name] = fn
113
+ elif key == "aggregation":
114
+ registry[name] = AGGREGATION_REGISTRY[value]
115
+ else:
116
+ registry[name] = value
117
+
118
+ return fn
119
+
120
+ return decorate
121
+
122
+
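# Metric registration sketch (hedged; "char_len_diff" is an invented metric and
# the registry does not enforce a particular metric-function signature):
#
#   from lm_eval.api.registry import register_metric
#
#   @register_metric(metric="char_len_diff", higher_is_better=False, aggregation="mean")
#   def char_len_diff(references, predictions, **kwargs):
#       return abs(len(references[0]) - len(predictions[0]))
#
# The "aggregation" value is looked up in AGGREGATION_REGISTRY, so a string such
# as "mean" must already have been registered via @register_aggregation.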
123
+ def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
124
+ if not hf_evaluate_metric:
125
+ if name in METRIC_REGISTRY:
126
+ return METRIC_REGISTRY[name]
127
+ else:
128
+ eval_logger.warning(
129
+ f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
130
+ )
131
+
132
+ try:
133
+ metric_object = hf_evaluate.load(name)
134
+ return metric_object.compute
135
+ except Exception:
136
+ eval_logger.error(
137
+ f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
138
+ )
139
+
140
+
141
+ def register_aggregation(name: str):
142
+ def decorate(fn):
143
+ assert (
144
+ name not in AGGREGATION_REGISTRY
145
+ ), f"aggregation named '{name}' conflicts with existing registered aggregation!"
146
+
147
+ AGGREGATION_REGISTRY[name] = fn
148
+ return fn
149
+
150
+ return decorate
151
+
152
+
153
+ def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
154
+ try:
155
+ return AGGREGATION_REGISTRY[name]
156
+ except KeyError:
157
+ eval_logger.warning(f"{name} not a registered aggregation metric!")
158
+
159
+
160
+ def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
161
+ try:
162
+ return METRIC_AGGREGATION_REGISTRY[name]
163
+ except KeyError:
164
+ eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
165
+
166
+
167
+ def is_higher_better(metric_name) -> bool:
168
+ try:
169
+ return HIGHER_IS_BETTER_REGISTRY[metric_name]
170
+ except KeyError:
171
+ eval_logger.warning(
172
+ f"higher_is_better not specified for metric '{metric_name}'!"
173
+ )
174
+
175
+
176
+ def register_filter(name):
177
+ def decorate(cls):
178
+ if name in FILTER_REGISTRY:
179
+ eval_logger.info(
180
+ f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
181
+ )
182
+ FILTER_REGISTRY[name] = cls
183
+ return cls
184
+
185
+ return decorate
186
+
187
+
188
+ def get_filter(filter_name: str) -> type:
189
+ try:
190
+ return FILTER_REGISTRY[filter_name]
191
+ except KeyError:
192
+ eval_logger.warning(f"filter `{filter_name}` is not registered!")
scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py ADDED
@@ -0,0 +1,198 @@
1
+ from functools import partial
2
+
3
+ import datasets
4
+
5
+
6
+ class ContextSampler:
7
+ def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
8
+ self.rnd = rnd
9
+ if not self.rnd:
10
+ raise ValueError(
11
+ "A `random.Random` generator argument must be provided to `rnd` of ContextSampler!"
12
+ )
13
+
14
+ self.task = task
15
+ self.config = task._config
16
+
17
+ self.target_delimiter = self.config.target_delimiter
18
+ self.fewshot_delimiter = self.config.fewshot_delimiter
19
+
20
+ if (
21
+ self.config.fewshot_config is not None
22
+ and self.config.fewshot_config.get("doc_to_text", None) is not None
23
+ ):
24
+ self.doc_to_text = partial(
25
+ self.task.doc_to_text,
26
+ doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
27
+ )
28
+ else:
29
+ self.doc_to_text = self.task.doc_to_text
30
+
31
+ if (
32
+ self.config.fewshot_config is not None
33
+ and self.config.fewshot_config.get("doc_to_target", None) is not None
34
+ ):
35
+ self.doc_to_target = partial(
36
+ self.task.doc_to_target,
37
+ doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
38
+ )
39
+ else:
40
+ self.doc_to_target = self.task.doc_to_target
41
+
42
+ if (
43
+ self.config.fewshot_config is not None
44
+ and self.config.fewshot_config.get("doc_to_choice", None) is not None
45
+ ):
46
+ self.doc_to_choice = partial(
47
+ self.task.doc_to_choice,
48
+ doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
49
+ )
50
+ else:
51
+ self.doc_to_choice = self.task.doc_to_choice
52
+
53
+ self.docs = docs # HF dataset split, provided by task._fewshot_docs()
54
+ if fewshot_indices: # restrict few-shot docs to the provided indices
55
+ if not isinstance(self.docs, datasets.Dataset):
56
+ raise ValueError(
57
+ "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
58
+ )
59
+ self.docs = self.docs.select(fewshot_indices)
60
+
61
+ def get_context(self, doc, num_fewshot):
62
+ # draw an extra fewshot sample if using same split as evaluating on
63
+ n_samples = (
64
+ num_fewshot + 1
65
+ if self.config.fewshot_split == self.config.test_split
66
+ else num_fewshot
67
+ )
68
+
69
+ # draw `n_samples` docs from fewshot_docs
70
+ fewshotex = self.sample(n_samples)
71
+
72
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
73
+ # TODO: should we just stop people from using fewshot from same split as evaluating?
74
+ selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
75
+
76
+ labeled_examples = ""
77
+ for doc in selected_docs:
78
+ doc_content = self.doc_to_text(doc)
79
+ doc_target = self.doc_to_target(doc)
80
+ labeled_examples += (
81
+ doc_content
82
+ if self.config.doc_to_choice is None or isinstance(doc_content, str)
83
+ else self.doc_to_choice(doc)[doc_content]
84
+ )
85
+ labeled_examples += self.target_delimiter
86
+ if doc_target != "":
87
+ labeled_examples += (
88
+ str(doc_target[0])
89
+ if isinstance(doc_target, list)
90
+ else doc_target
91
+ if self.config.doc_to_choice is None or isinstance(doc_target, str)
92
+ else str(self.doc_to_choice(doc)[doc_target])
93
+ )
94
+ labeled_examples += self.fewshot_delimiter
95
+
96
+ return labeled_examples
97
+
98
+ def get_chat_context(
99
+ self,
100
+ doc,
101
+ num_fewshot,
102
+ fewshot_as_multiturn: bool = False,
103
+ ):
104
+ chat_history = []
105
+ # draw an extra fewshot sample if using same split as evaluating on
106
+ n_samples = (
107
+ num_fewshot + 1
108
+ if self.config.fewshot_split == self.config.test_split
109
+ else num_fewshot
110
+ )
111
+ # draw `n_samples` docs from fewshot_docs
112
+ fewshotex = self.sample(n_samples)
113
+
114
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
115
+ # TODO: should we just stop people from using fewshot from same split as evaluating?
116
+ selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
117
+
118
+ if fewshot_as_multiturn:
119
+ for doc in selected_docs:
120
+ doc_content = self.doc_to_text(doc)
121
+ doc_target = self.doc_to_target(doc)
122
+ chat_history.append(
123
+ {
124
+ "role": "user",
125
+ "content": doc_content
126
+ if self.config.doc_to_choice is None
127
+ or isinstance(doc_content, str)
128
+ else self.doc_to_choice(doc)[doc_content],
129
+ }
130
+ )
131
+ chat_history.append(
132
+ {
133
+ "role": "assistant",
134
+ "content": str(doc_target[0])
135
+ if isinstance(doc_target, list)
136
+ else doc_target
137
+ if self.config.doc_to_choice is None
138
+ or isinstance(doc_target, str)
139
+ else str(self.doc_to_choice(doc)[doc_target]),
140
+ }
141
+ )
142
+ else:
143
+ # get fewshot context as one user turn
144
+ chat_history.append(
145
+ {"role": "user", "content": self.get_context(doc, num_fewshot)}
146
+ )
147
+
148
+ return chat_history
149
+
150
+ def sample(self, n):
151
+ """
152
+ Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
153
+ """
154
+
155
+ return self.rnd.sample(self.docs, n)
156
+
157
+
158
+ class FirstNSampler(ContextSampler):
159
+ def sample(self, n):
160
+ """
161
+ Draw the first `n` samples in order from the specified split.
162
+ Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
163
+ """
164
+ assert (
165
+ n <= len(self.docs)
166
+ ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
167
+ return self.docs[:n]
168
+
169
+
170
+ class BalancedSampler(ContextSampler):
171
+ def sample(self, n) -> None:
172
+ """
173
+ TODO: this should return approximately class-balanced samples from our fewshot examples.
174
+ TODO: what order should they be in? maybe random?
175
+ """
176
+
177
+ pass
178
+
179
+
180
+ class ManualSampler(ContextSampler):
181
+ def sample(self, n) -> None:
182
+ """ """
183
+ pass
184
+
185
+
186
+ SAMPLER_REGISTRY = {
187
+ "default": ContextSampler,
188
+ "first_n": FirstNSampler,
189
+ }
190
+
191
+
192
+ def get_sampler(name):
193
+ try:
194
+ return SAMPLER_REGISTRY[name]
195
+ except KeyError:
196
+ raise ValueError(
197
+ f"Attempted to use context sampler '{name}', but no sampling strategy with this name was found! Supported sampler names: {', '.join(SAMPLER_REGISTRY.keys())}"
198
+ )
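# Selection sketch: resolve a sampler class by name and instantiate it the way
# ConfigurableTask does (`docs`, `task`, and `doc` below are stand-ins).
#
#   import random
#   from lm_eval.api.samplers import get_sampler
#
#   sampler = get_sampler("first_n")(docs, task, rnd=random.Random(1234))
#   context = sampler.get_context(doc, num_fewshot=5)
#
# "default" draws a random subset with rnd.sample; "first_n" always takes the
# first n docs, for benchmarks with a canonical ordered few-shot set.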
scripts/yans/lm-evaluation-harness/lm_eval/api/task.py ADDED
@@ -0,0 +1,1674 @@
1
+ import abc
2
+ import ast
3
+ import logging
4
+ import random
5
+ import re
6
+ from collections.abc import Callable
7
+ from copy import deepcopy
8
+ from dataclasses import asdict, dataclass
9
+ from inspect import getsource
10
+ from typing import (
11
+ Any,
12
+ Dict,
13
+ Iterable,
14
+ Iterator,
15
+ List,
16
+ Literal,
17
+ Mapping,
18
+ Optional,
19
+ Tuple,
20
+ Union,
21
+ )
22
+
23
+ import datasets
24
+ import numpy as np
25
+ from tqdm import tqdm
26
+
27
+ from lm_eval import utils
28
+ from lm_eval.api import samplers
29
+ from lm_eval.api.instance import Instance, OutputType
30
+ from lm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity
31
+ from lm_eval.api.registry import (
32
+ AGGREGATION_REGISTRY,
33
+ DEFAULT_METRIC_REGISTRY,
34
+ get_aggregation,
35
+ get_metric,
36
+ get_metric_aggregation,
37
+ is_higher_better,
38
+ )
39
+ from lm_eval.caching.cache import load_from_cache, save_to_cache
40
+ from lm_eval.filters import build_filter_ensemble
41
+ from lm_eval.prompts import get_prompt
42
+
43
+
44
+ ALL_OUTPUT_TYPES = [
45
+ "loglikelihood",
46
+ "multiple_choice",
47
+ "loglikelihood_rolling",
48
+ "generate_until",
49
+ ]
50
+
51
+ eval_logger = logging.getLogger("lm-eval")
52
+
53
+
54
+ @dataclass
55
+ class TaskConfig(dict):
56
+ # task naming/registry
57
+ task: Optional[str] = None
58
+ task_alias: Optional[str] = None
59
+ tag: Optional[Union[str, list]] = None
60
+ group: Optional[Union[str, list]] = None
61
+ # HF dataset options.
62
+ # which dataset to use,
63
+ # and what splits for what purpose
64
+ dataset_path: Optional[str] = None
65
+ dataset_name: Optional[str] = None
66
+ dataset_kwargs: Optional[dict] = None
67
+ training_split: Optional[str] = None
68
+ validation_split: Optional[str] = None
69
+ test_split: Optional[str] = None
70
+ fewshot_split: Optional[str] = (
71
+ None # TODO: assert that this is not None if num_fewshot > 0 (?); assert that it is not the same split as the one being evaluated (?)
72
+ )
73
+ # formatting / prompting options.
74
+ # see docs/advanced_task_guide.md for more info
75
+ process_docs: Optional[Callable] = None
76
+ doc_to_text: Optional[Union[Callable, str]] = None
77
+ doc_to_target: Optional[Union[Callable, str]] = None
78
+ doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
79
+ process_results: Optional[Union[Callable, str]] = None
80
+ use_prompt: Optional[str] = None
81
+ description: str = ""
82
+ target_delimiter: str = " "
83
+ fewshot_delimiter: str = "\n\n"
84
+ fewshot_config: Optional[dict] = None
85
+ # runtime configuration options
86
+ num_fewshot: Optional[int] = None
87
+ # scoring options
88
+ metric_list: Optional[list] = None
89
+ output_type: OutputType = "generate_until"
90
+ generation_kwargs: Optional[dict] = None
91
+ repeats: int = 1
92
+ filter_list: Optional[Union[str, list]] = None
93
+ should_decontaminate: bool = False
94
+ doc_to_decontamination_query: Optional[str] = None
95
+ metadata: Optional[dict] = (
96
+ None # by default, not used in the code. allows for users to pass arbitrary info to tasks
97
+ )
98
+
99
+ def __post_init__(self) -> None:
100
+ if self.group is not None:
101
+ eval_logger.warning(
102
+ "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
103
+ )
104
+
105
+ if self.tag is None:
106
+ self.tag = self.group
107
+ else:
108
+ raise ValueError(
109
+ "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
110
+ )
111
+
112
+ if self.generation_kwargs is not None:
113
+ if self.output_type != "generate_until":
114
+ eval_logger.warning(
115
+ f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
116
+ )
117
+
118
+ if "temperature" in self.generation_kwargs:
119
+ self.generation_kwargs["temperature"] = float(
120
+ self.generation_kwargs["temperature"]
121
+ )
122
+
123
+ if "until" not in self.generation_kwargs:
124
+ self.generation_kwargs["until"] = [self.fewshot_delimiter]
125
+ else:
126
+ if self.output_type == "generate_until":
127
+ # ensure that we greedily generate in absence of explicit arguments otherwise
128
+ self.generation_kwargs = {
129
+ "until": (
130
+ None
131
+ if self.fewshot_delimiter is None
132
+ else [self.fewshot_delimiter]
133
+ ),
134
+ "do_sample": False,
135
+ }
136
+
137
+ def __getitem__(self, item):
138
+ return getattr(self, item)
139
+
140
+ def __setitem__(self, item, value):
141
+ return setattr(self, item, value)
142
+
143
+ def to_dict(self, keep_callable: bool = False) -> dict:
144
+ """dumps the current config as a dictionary object, as a printable format.
145
+ null fields will not be printed.
146
+ Used for dumping results alongside full task configuration
147
+
148
+ :return: dict
149
+ A printable dictionary version of the TaskConfig object.
150
+
151
+ # TODO: should any default value in the TaskConfig not be printed?
152
+ """
153
+ cfg_dict = asdict(self)
154
+ # remove values that are `None`
155
+ for k, v in list(cfg_dict.items()):
156
+ if v is None:
157
+ cfg_dict.pop(k)
158
+ elif k == "metric_list":
159
+ for metric_dict in v:
160
+ for metric_key, metric_value in metric_dict.items():
161
+ if callable(metric_value):
162
+ metric_dict[metric_key] = self.serialize_function(
163
+ metric_value, keep_callable=keep_callable
164
+ )
165
+ cfg_dict[k] = v
166
+ elif callable(v):
167
+ cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
168
+ return cfg_dict
169
+
170
+ def serialize_function(
171
+ self, value: Union[Callable, str], keep_callable=False
172
+ ) -> Union[Callable, str]:
173
+ """Serializes a given function or string.
174
+
175
+ If 'keep_callable' is True, the original callable is returned.
176
+ Otherwise, attempts to return the source code of the callable using 'getsource'.
177
+ """
178
+ if keep_callable:
179
+ return value
180
+ else:
181
+ try:
182
+ return getsource(value)
183
+ except (TypeError, OSError):
184
+ return str(value)
185
+
186
+
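# Configuration sketch (hedged): task YAML fields map onto TaskConfig
# attributes; the dataset and task names below are placeholders.
#
#   from lm_eval.api.task import TaskConfig
#
#   cfg = TaskConfig(
#       task="my_translation_task",
#       dataset_path="my_org/my_dataset",
#       test_split="test",
#       output_type="generate_until",
#       doc_to_text="{{source}}",
#       doc_to_target="{{target}}",
#       metric_list=[{"metric": "exact_match"}],
#       num_fewshot=0,
#   )
#
# With no generation_kwargs supplied, __post_init__ falls back to greedy
# decoding ({"do_sample": False}) with the fewshot_delimiter as the stop string.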
187
+ class Task(abc.ABC):
188
+ """A task represents an entire benchmark including its dataset, problems,
189
+ answers, and evaluation methods. See BoolQ for a simple example implementation
190
+
191
+ A `doc` can be any python object which represents one instance of evaluation.
192
+ This is usually a dictionary e.g.
193
+ {"question": ..., "answer": ...} or
194
+ a tuple such as (question, answer).
195
+ """
196
+
197
+ VERSION: Optional[Union[int, str]] = None
198
+
199
+ # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
200
+ # or a path to a custom `datasets` loading script.
201
+ DATASET_PATH: Optional[str] = None
202
+
203
+ # The name of a subset within `DATASET_PATH`.
204
+ DATASET_NAME: Optional[str] = None
205
+
206
+ OUTPUT_TYPE: Optional[OutputType] = None
207
+
208
+ def __init__(
209
+ self,
210
+ data_dir: Optional[str] = None,
211
+ cache_dir: Optional[str] = None,
212
+ download_mode: Optional[datasets.DownloadMode] = None,
213
+ config: Optional[Mapping] = None, # Union[dict, TaskConfig]
214
+ ) -> None:
215
+ """
216
+ :param data_dir: str
217
+ Stores the path to a local folder containing the `Task`'s data files.
218
+ Use this to specify the path to manually downloaded data (usually when
219
+ the dataset is not publicly accessible).
220
+ :param cache_dir: str
221
+ The directory to read/write the `Task` dataset. This follows the
222
+ HuggingFace `datasets` API with the default cache directory located at:
223
+ `~/.cache/huggingface/datasets`
224
+ NOTE: You can change the cache location globally for a given process
225
+ to another directory:
226
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
227
+ :param download_mode: datasets.DownloadMode
228
+ How to treat pre-existing `Task` downloads and data.
229
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
230
+ Reuse download and reuse dataset.
231
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
232
+ Reuse download with fresh dataset.
233
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
234
+ Fresh download and fresh dataset.
235
+ """
236
+ self.download(data_dir, cache_dir, download_mode)
237
+ self._training_docs: Optional[list] = None
238
+ self._fewshot_docs: Optional[list] = None
239
+ self._instances: Optional[List[Instance]] = None
240
+
241
+ self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
242
+
243
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
244
+ self.fewshot_rnd: Optional[random.Random] = (
245
+ None # purposely induce errors in case of improper usage
246
+ )
247
+
248
+ def download(
249
+ self,
250
+ data_dir: Optional[str] = None,
251
+ cache_dir: Optional[str] = None,
252
+ download_mode=None,
253
+ ) -> None:
254
+ """Downloads and returns the task dataset.
255
+ Override this method to download the dataset from a custom API.
256
+
257
+ :param data_dir: str
258
+ Stores the path to a local folder containing the `Task`'s data files.
259
+ Use this to specify the path to manually downloaded data (usually when
260
+ the dataset is not publicly accessible).
261
+ :param cache_dir: str
262
+ The directory to read/write the `Task` dataset. This follows the
263
+ HuggingFace `datasets` API with the default cache directory located at:
264
+ `~/.cache/huggingface/datasets`
265
+ NOTE: You can change the cache location globally for a given process
266
+ by setting the shell environment variable, `HF_DATASETS_CACHE`,
267
+ to another directory:
268
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
269
+ :param download_mode: datasets.DownloadMode
270
+ How to treat pre-existing `Task` downloads and data.
271
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
272
+ Reuse download and reuse dataset.
273
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
274
+ Reuse download with fresh dataset.
275
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
276
+ Fresh download and fresh dataset.
277
+ """
278
+ self.dataset = datasets.load_dataset(
279
+ path=self.DATASET_PATH,
280
+ name=self.DATASET_NAME,
281
+ data_dir=data_dir,
282
+ cache_dir=cache_dir,
283
+ download_mode=download_mode,
284
+ )
285
+
286
+ @property
287
+ def config(self) -> TaskConfig:
288
+ """Returns the TaskConfig associated with this class."""
289
+ return self._config
290
+
291
+ @abc.abstractmethod
292
+ def has_training_docs(self):
293
+ """Whether the task has a training set"""
294
+ pass
295
+
296
+ @abc.abstractmethod
297
+ def has_validation_docs(self):
298
+ """Whether the task has a validation set"""
299
+ pass
300
+
301
+ @abc.abstractmethod
302
+ def has_test_docs(self):
303
+ """Whether the task has a test set"""
304
+ pass
305
+
306
+ def training_docs(self) -> Iterable:
307
+ """
308
+ :return: Iterable[obj]
309
+ An iterable of any objects that doc_to_text can handle
310
+ """
311
+ return []
312
+
313
+ def validation_docs(self) -> Iterable:
314
+ """
315
+ :return: Iterable[obj]
316
+ An iterable of any objects that doc_to_text can handle
317
+ """
318
+ return []
319
+
320
+ def test_docs(self) -> Iterable:
321
+ """
322
+ :return: Iterable[obj]
323
+ An iterable of any objects that doc_to_text can handle
324
+ """
325
+ return []
326
+
327
+ def fewshot_docs(self) -> Iterable:
328
+ """
329
+ :return: Iterable[obj]
330
+ An iterable of any objects that doc_to_text can handle
331
+ """
332
+ if self.has_training_docs():
333
+ return self.training_docs()
334
+ elif self.has_validation_docs():
335
+ return self.validation_docs()
336
+ else:
337
+ eval_logger.warning(
338
+ f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
339
+ ", using test_docs as fewshot_docs but this is not recommended."
340
+ )
341
+ return self.test_docs()
342
+
343
+ def _process_doc(self, doc: dict) -> dict:
344
+ """
345
+ Override this to process (detokenize, strip, replace, etc.) individual
346
+ documents. This can be used in a map over documents of a data split.
347
+ E.g. `map(self._process_doc, self.dataset["validation"])`
348
+
349
+ :return: dict
350
+ The processed version of the specified `doc`.
351
+ """
352
+ return doc
353
+
354
+ @property
355
+ def instances(self) -> List[Instance]:
356
+ """After calling `task.build_all_requests()`, tasks
357
+ maintain a list of the dataset instances which will be evaluated.
358
+ """
359
+ return self._instances
360
+
361
+ def fewshot_examples(self, k, rnd):
362
+ if self._training_docs is None:
363
+ self._training_docs = list(self.training_docs())
364
+
365
+ return rnd.sample(self._training_docs, k)
366
+
367
+ def doc_to_decontamination_query(self, doc):
368
+ raise NotImplementedError(
369
+ "Override doc_to_decontamination_query with document specific decontamination query."
370
+ )
371
+
372
+ @abc.abstractmethod
373
+ def doc_to_text(self, doc):
374
+ pass
375
+
376
+ @abc.abstractmethod
377
+ def doc_to_target(self, doc):
378
+ pass
379
+
380
+ def build_all_requests(
381
+ self,
382
+ *,
383
+ limit: Union[int, None] = None,
384
+ rank: int = 0,
385
+ world_size: int = 1,
386
+ cache_requests: bool = False,
387
+ rewrite_requests_cache: bool = False,
388
+ system_instruction: Optional[str] = None,
389
+ apply_chat_template: bool = False,
390
+ fewshot_as_multiturn: bool = False,
391
+ chat_template: Optional[Callable] = None,
392
+ tokenizer_name: str = "",
393
+ ) -> None:
394
+ """Build a set of Instances for a task, and store them in task.instances"""
395
+
396
+ # used with caching
397
+ og_limit = limit
398
+
399
+ cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
400
+ cache_key += "-chat_template" if apply_chat_template else ""
401
+ cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
402
+ cache_key += (
403
+ f"-system_prompt_hash{utils.hash_string(system_instruction)}"
404
+ if system_instruction is not None
405
+ else ""
406
+ )
407
+ cache_key += f"-tokenizer{tokenizer_name}"
408
+
409
+ cached_instances = load_from_cache(file_name=cache_key)
410
+
411
+ if cache_requests and cached_instances and not rewrite_requests_cache:
412
+ cached_instances = cached_instances[:limit]
413
+
414
+ flattened_instances = [
415
+ instance
416
+ for instance_group in cached_instances
417
+ for instance in instance_group
418
+ ]
419
+
420
+ self._instances = flattened_instances
421
+ return
422
+
423
+ eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
424
+
425
+ instances = []
426
+
427
+ # process all documents when caching is specified for simplicity
428
+ if (
429
+ cache_requests
430
+ and (not cached_instances or rewrite_requests_cache)
431
+ and limit is not None
432
+ ):
433
+ limit = None
434
+
435
+ doc_id_docs = list(
436
+ self.doc_iterator(rank=rank, limit=limit, world_size=world_size)
437
+ )
438
+
439
+ num_docs = len(doc_id_docs)
440
+
441
+ for doc_id, doc in tqdm(
442
+ doc_id_docs,
443
+ total=num_docs,
444
+ ):
445
+ # sample fewshot context #TODO: need to offset doc_id by rank now!
446
+ fewshot_ctx = self.fewshot_context(
447
+ doc,
448
+ 0 if self.config.num_fewshot is None else self.config.num_fewshot,
449
+ system_instruction,
450
+ apply_chat_template,
451
+ fewshot_as_multiturn,
452
+ chat_template,
453
+ )
454
+
455
+ # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
456
+ inst = self.construct_requests(
457
+ doc=doc,
458
+ ctx=fewshot_ctx,
459
+ metadata=(self.config["task"], doc_id, self.config.repeats),
460
+ )
461
+
462
+ if not isinstance(inst, list):
463
+ inst = [inst]
464
+
465
+ instances.append(inst)
466
+
467
+ # now flatten, this is to allow slicing to work with pickles
468
+
469
+ sliced_instances = instances[:og_limit]
470
+
471
+ flattened_instances = [
472
+ instance
473
+ for instance_group in sliced_instances
474
+ for instance in instance_group
475
+ ]
476
+
477
+ self._instances = flattened_instances
478
+
479
+ if len(self._instances) == 0:
480
+ raise ValueError("task.build_all_requests() did not find any docs!")
481
+
482
+ if cache_requests and (not cached_instances or rewrite_requests_cache):
483
+ save_to_cache(file_name=cache_key, obj=instances)
484
+
485
+ @abc.abstractmethod
486
+ def construct_requests(self, doc, ctx, **kwargs):
487
+ """Uses RequestFactory to construct Requests and returns an iterable of
488
+ Requests which will be sent to the LM.
489
+
490
+ :param doc:
491
+ The document as returned from training_docs, validation_docs, or test_docs.
492
+ :param ctx: str
493
+ The context string, generated by fewshot_context. This includes the natural
494
+ language description, as well as the few shot examples, and the question
495
+ part of the document for `doc`.
496
+ :param doc_idx: int
497
+ The index of a document within `self.test_docs()` or `self.validation_docs()`,
498
+ whichever is the main split used.
499
+ :param repeats: int
500
+ TODO: update this docstring
501
+ The number of times each instance in a dataset is inferred on. Defaults to 1,
502
+ can be increased for techniques like majority voting.
503
+ """
504
+ pass
505
+
506
+ @abc.abstractmethod
507
+ def process_results(self, doc, results):
508
+ """Take a single document and the LM results and evaluates, returning a
509
+ dict where keys are the names of submetrics and values are the values of
510
+ the metric for that one document
511
+
512
+ :param doc:
513
+ The document as returned from training_docs, validation_docs, or test_docs.
514
+ :param results:
515
+ The results of the requests created in construct_requests.
516
+ """
517
+ pass
518
+
519
+ @abc.abstractmethod
520
+ def aggregation(self):
521
+ """
522
+ :returns: {str: [metric_score] -> float}
523
+ A dictionary where keys are the names of submetrics and values are
524
+ functions that aggregate a list of metric scores
525
+ """
526
+ pass
527
+
528
+ @abc.abstractmethod
529
+ def higher_is_better(self):
530
+ """
531
+ :returns: {str: bool}
532
+ A dictionary where keys are the names of submetrics and values are
533
+ whether a higher value of the submetric is better
534
+ """
535
+ pass
536
+
537
+ def get_config(self, key: str) -> Any:
538
+ return getattr(self._config, key, None)
539
+
540
+ @classmethod
541
+ def count_bytes(cls, doc):
542
+ """Used for byte-level perplexity metrics in rolling loglikelihood"""
543
+ return len(doc.encode("utf-8"))
544
+
545
+ @classmethod
546
+ def count_words(cls, doc):
547
+ """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
548
+ return len(re.split(r"\s+", doc))
549
+
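# Quick check of the two counters above, which feed byte- and word-level
# perplexity normalisation:
import re

doc = "Hello world"
assert len(doc.encode("utf-8")) == 11        # count_bytes
assert len(re.split(r"\s+", doc)) == 2       # count_words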
550
+ @utils.positional_deprecated
551
+ def fewshot_context(
552
+ self,
553
+ doc,
554
+ num_fewshot,
555
+ rnd=None,
556
+ description=None,
557
+ ):
558
+ """Returns a fewshot context string that is made up of a prepended description
559
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
560
+
561
+ :param doc: str
562
+ The document as returned from training_docs, validation_docs, or test_docs.
563
+ :param num_fewshot: int
564
+ The number of fewshot examples to provide in the returned context string.
565
+ :param rnd: random.Random
566
+ The pseudo-random number generator used to randomly sample examples.
567
+ WARNING: This argument is effectively required even though it defaults to `None`.
568
+ :param description: str
569
+ The task's description that will be prepended to the fewshot examples.
570
+ :returns: str
571
+ The fewshot context.
572
+ """
573
+ if rnd is None:
574
+ if self.fewshot_rnd is not None:
575
+ rnd = self.fewshot_rnd
576
+ else:
577
+ raise ValueError(
578
+ "A `random.Random` generator argument must be provided to `rnd`"
579
+ )
580
+
581
+ description = description if description else ""
582
+
583
+ if num_fewshot == 0:
584
+ labeled_examples = ""
585
+ else:
586
+ # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
587
+ if self.has_training_docs():
588
+ fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
589
+ else:
590
+ if self._fewshot_docs is None:
591
+ self._fewshot_docs = list(
592
+ self.validation_docs()
593
+ if self.has_validation_docs()
594
+ else self.test_docs()
595
+ )
596
+
597
+ fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
598
+
599
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
600
+ fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
601
+
602
+ labeled_examples = (
603
+ "\n\n".join(
604
+ [
605
+ self.doc_to_text(doc) + self.doc_to_target(doc)
606
+ for doc in fewshotex
607
+ ]
608
+ )
609
+ + "\n\n"
610
+ )
611
+
612
+ example = self.doc_to_text(doc)
613
+ return description + labeled_examples + example
614
+
615
+ def apply_filters(self) -> Optional[List[Instance]]:
616
+ """Iterates over FilterEnsembles and applies them to instances"""
617
+ if hasattr(self, "_filters"):
618
+ for f in self._filters:
619
+ f.apply(self._instances)
620
+ else:
621
+ eval_logger.warning("No filter defined, passing through instances")
622
+ return self._instances
623
+
624
+ def dump_config(self) -> dict:
625
+ """Returns the config as a dictionary."""
626
+ # TODO: this should only return the overrides applied to a non-YAML task's configuration.
627
+ # (num_fewshot)
628
+ return self.config.to_dict()
629
+
630
+ def set_config(self, key: str, value: Any, update: bool = False) -> None:
631
+ """Set or update the configuration for a given key."""
632
+ if key is None:
633
+ raise ValueError("Key must be provided.")
634
+
635
+ if update:
636
+ current_value = getattr(self._config, key, {})
637
+ if not isinstance(current_value, dict):
638
+ raise TypeError(
639
+ f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
640
+ )
641
+ current_value.update(value)
642
+ else:
643
+ setattr(self._config, key, value)
644
+
645
+ def override_metric(self, metric_name: str) -> None:
646
+ """
647
+ Override the default metrics used for evaluation with custom metrics.
648
+
649
+ Parameters:
650
+ - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
651
+ """
652
+ (
653
+ self._metric_fn_list,
654
+ self._aggregation_list,
655
+ self._metric_fn_kwargs,
656
+ self._higher_is_better,
657
+ ) = ({}, {}, {}, {})
658
+ self._metric_fn_list[metric_name] = get_metric(metric_name)
659
+ self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
660
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
661
+ self._metric_fn_kwargs[metric_name] = {}
662
+ if not isinstance(self, ConfigurableTask):
663
+ self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
664
+ self.aggregation = lambda: {
665
+ metric_name: get_metric_aggregation(metric_name)
666
+ }
667
+ setattr(self._config, "metric_list", [{"metric": metric_name}])
668
+ setattr(self._config, "process_results", None)
669
+
670
+ def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
671
+ self.fewshot_rnd = random.Random(seed)
672
+ if hasattr(self, "sampler"):
673
+ self.sampler.rnd = self.fewshot_rnd
674
+
675
+ @property
676
+ def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
677
+ if self.has_test_docs():
678
+ return self.test_docs()
679
+ elif self.has_validation_docs():
680
+ return self.validation_docs()
681
+ else:
682
+ raise ValueError(
683
+ f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
684
+ )
685
+
686
+ def doc_iterator(
687
+ self, *, rank: int = 0, limit: Union[int, None] = None, world_size: int = 1
688
+ ) -> Iterator[Tuple[int, Any]]:
689
+ limit = int(limit) if limit else None
690
+ doc_iterator = utils.create_iterator(
691
+ enumerate(self.eval_docs),
692
+ rank=int(rank),
693
+ limit=limit,
694
+ world_size=int(world_size),
695
+ )
696
+ return doc_iterator
697
+
698
+
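# Sharding sketch: each rank receives an interleaved stride of the enumerated
# docs (utils.create_iterator is assumed to behave like the islice pattern below).
from itertools import islice

docs = [f"doc_{i}" for i in range(10)]
world_size, limit = 2, None
shards = [
    list(islice(enumerate(docs), rank, limit, world_size))
    for rank in range(world_size)
]
print(shards[0])  # rank 0 sees indices 0, 2, 4, ...
print(shards[1])  # rank 1 sees indices 1, 3, 5, ...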
699
+ class ConfigurableTask(Task):
700
+ VERSION = "Yaml"
701
+ OUTPUT_TYPE = None
702
+ CONFIG = None
703
+
704
+ def __init__(
705
+ self,
706
+ data_dir=None,
707
+ cache_dir=None,
708
+ download_mode=None,
709
+ config: Optional[dict] = None,
710
+ ) -> None: # TODO no super() call here
711
+ # Get pre-configured attributes
712
+ self._config = self.CONFIG
713
+
714
+ # Use new configurations if there was no preconfiguration
715
+ if self.config is None:
716
+ self._config = TaskConfig(**config)
717
+ # Overwrite configs
718
+ else:
719
+ if config is not None:
720
+ self._config.__dict__.update(config)
721
+
722
+ if self.config is None:
723
+ raise ValueError(
724
+ "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
725
+ )
726
+
727
+ if isinstance(self.config.metadata, dict):
728
+ if "version" in self.config.metadata:
729
+ self.VERSION = self.config.metadata["version"]
730
+
731
+ if self.config.output_type is not None:
732
+ if self.config.output_type not in ALL_OUTPUT_TYPES:
733
+ raise ValueError(
734
+ f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'"
735
+ )
736
+ self.OUTPUT_TYPE = self.config.output_type
737
+
738
+ if self.config.dataset_path is not None:
739
+ self.DATASET_PATH = self.config.dataset_path
740
+
741
+ if self.config.dataset_name is not None:
742
+ self.DATASET_NAME = self.config.dataset_name
743
+
744
+ self._metric_fn_list = {}
745
+ self._metric_fn_kwargs = {}
746
+ self._aggregation_list = {}
747
+ self._higher_is_better = {}
748
+
749
+ if self.config.metric_list is None:
750
+ # TODO: handle this in TaskConfig.__post_init__ ?
751
+ _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
752
+
753
+ for metric_name in _metric_list:
754
+ self._metric_fn_list[metric_name] = get_metric(metric_name)
755
+ self._metric_fn_kwargs[metric_name] = {}
756
+ self._aggregation_list[metric_name] = get_metric_aggregation(
757
+ metric_name
758
+ )
759
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
760
+ else:
761
+ for metric_config in self.config.metric_list:
762
+ if "metric" not in metric_config:
763
+ raise ValueError(
764
+ "'metric' key not provided for an entry in 'metric_list', must be specified!"
765
+ )
766
+ metric_name = metric_config["metric"]
767
+ kwargs = {
768
+ key: metric_config[key]
769
+ for key in metric_config
770
+ if key
771
+ not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
772
+ }
773
+ hf_evaluate_metric = (
774
+ "hf_evaluate" in metric_config
775
+ and metric_config["hf_evaluate"] is True
776
+ )
777
+
778
+ if self.config.process_results is not None:
779
+ self._metric_fn_list[metric_name] = None
780
+ self._metric_fn_kwargs[metric_name] = {}
781
+ elif callable(metric_name):
782
+ metric_fn = metric_name.__call__
783
+ metric_name = metric_name.__name__
784
+ self._metric_fn_list[metric_name] = metric_fn
785
+ self._metric_fn_kwargs[metric_name] = kwargs
786
+ else:
787
+ self._metric_fn_list[metric_name] = get_metric(
788
+ metric_name, hf_evaluate_metric
789
+ )
790
+ self._metric_fn_kwargs[metric_name] = kwargs
791
+
792
+ if "aggregation" in metric_config:
793
+ agg_name = metric_config["aggregation"]
794
+ if isinstance(agg_name, str):
795
+ self._aggregation_list[metric_name] = get_aggregation(agg_name)
796
+ elif callable(agg_name): # noqa: E721
797
+ self._aggregation_list[metric_name] = metric_config[
798
+ "aggregation"
799
+ ]
800
+ else:
801
+ INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
802
+ metric_agg = get_metric_aggregation(metric_name)
803
+ eval_logger.warning(
804
+ f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. "
805
+ f"using default "
806
+ f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
807
+ )
808
+ self._aggregation_list[metric_name] = metric_agg
809
+
810
+ if "higher_is_better" in metric_config:
811
+ self._higher_is_better[metric_name] = metric_config[
812
+ "higher_is_better"
813
+ ]
814
+ else:
815
+ eval_logger.warning(
816
+ f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. "
817
+ f"using default "
818
+ f"higher_is_better={is_higher_better(metric_name)}"
819
+ )
820
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
821
+
822
+ self.download(self.config.dataset_kwargs)
823
+ self._training_docs = None
824
+ self._fewshot_docs = None
825
+
826
+ if self.config.filter_list is not None:
827
+ self._filters = []
828
+ for filter_config in self.config.filter_list:
829
+ filter_name = filter_config["name"]
830
+ filter_functions = filter_config["filter"]
831
+ components = []
832
+ for function in filter_functions:
833
+ kwargs = {
834
+ key: function[key] for key in function if key != "function"
835
+ }
836
+ components.append([function["function"], kwargs])
837
+ filter_pipeline = build_filter_ensemble(filter_name, components)
838
+ self._filters.append(filter_pipeline)
839
+ else:
840
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
841
+
842
+ if self.config.use_prompt is not None:
843
+ eval_logger.info(f"loading prompt {self.config.use_prompt}")
844
+ self.prompt = get_prompt(
845
+ self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
846
+ )
847
+ else:
848
+ self.prompt = None
849
+
850
+ if self.fewshot_docs() is not None:
851
+ self.fewshot_rnd = (
852
+ random.Random()
853
+ ) # setting with no seed, to be overridden at a later time
854
+ config_sampler: Union[str, Callable] = (
855
+ self.config.fewshot_config.get("sampler", "default")
856
+ if self.config.fewshot_config
857
+ else "default"
858
+ )
859
+ if isinstance(config_sampler, str):
860
+ self.sampler = samplers.get_sampler(config_sampler)(
861
+ list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
862
+ )
863
+ elif callable(config_sampler) and issubclass(
864
+ config_sampler, samplers.ContextSampler
865
+ ):
866
+ self.sampler = config_sampler(
867
+ docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
868
+ )
869
+ else:
870
+ raise TypeError(
871
+ f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
872
+ f"not {type(config_sampler)}"
873
+ )
874
+
875
+ self.task_docs = self.eval_docs
876
+
877
+ # Test One Doc
878
+ self.features = list(self.task_docs.features.keys())
879
+ self.multiple_input = 0
880
+ self.multiple_target = 0
881
+ test_doc = self.task_docs[0]
882
+ test_text = self.doc_to_text(test_doc)
883
+ test_target = self.doc_to_target(test_doc)
884
+
885
+ if self.config.doc_to_choice is not None:
886
+ test_choice = self.doc_to_choice(test_doc)
887
+ if not isinstance(test_choice, list):
888
+ eval_logger.error("doc_to_choice must return list")
889
+ else:
890
+ num_choice = len(test_choice)
891
+
892
+ if isinstance(test_text, int):
893
+ self.multiple_input = num_choice
894
+ else:
895
+ test_choice = None
896
+
897
+ if isinstance(test_target, list):
898
+ self.multiple_target = len(test_target)
899
+ else:
900
+ if (isinstance(test_target, int)) and (test_choice is not None):
901
+ test_target = test_choice[test_target]
902
+ else:
903
+ test_target = str(test_target)
904
+
905
+ if test_choice is not None:
906
+ check_choices = test_choice
907
+ else:
908
+ check_choices = [test_target]
909
+ if self.config.doc_to_choice is not None:
910
+ for choice in check_choices:
911
+ choice_has_whitespace = True if choice[0].isspace() else False
912
+ delimiter_has_whitespace = (
913
+ True
914
+ if self.config.target_delimiter.rstrip()
915
+ != self.config.target_delimiter
916
+ else False
917
+ )
918
+
919
+ if delimiter_has_whitespace and choice_has_whitespace:
920
+ eval_logger.debug(
921
+ f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace'
922
+ )
923
+ elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
924
+ eval_logger.debug(
925
+ f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
926
+ )
927
+
928
+ def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None:
929
+ self.dataset = datasets.load_dataset(
930
+ path=self.DATASET_PATH,
931
+ name=self.DATASET_NAME,
932
+ **dataset_kwargs if dataset_kwargs is not None else {},
933
+ )
934
+
935
+ def has_training_docs(self) -> bool:
936
+ if self.config.training_split is not None:
937
+ return True
938
+ else:
939
+ return False
940
+
941
+ def has_validation_docs(self) -> bool:
942
+ if self.config.validation_split is not None:
943
+ return True
944
+ else:
945
+ return False
946
+
947
+ def has_test_docs(self) -> bool:
948
+ if self.config.test_split is not None:
949
+ return True
950
+ else:
951
+ return False
952
+
953
+ def training_docs(self) -> datasets.Dataset:
954
+ if self.has_training_docs():
955
+ if self.config.process_docs is not None:
956
+ return self.config.process_docs(
957
+ self.dataset[self.config.training_split]
958
+ )
959
+ return self.dataset[self.config.training_split]
960
+
961
+ def validation_docs(self) -> datasets.Dataset:
962
+ if self.has_validation_docs():
963
+ if self.config.process_docs is not None:
964
+ return self.config.process_docs(
965
+ self.dataset[self.config.validation_split]
966
+ )
967
+ return self.dataset[self.config.validation_split]
968
+
969
+ def test_docs(self) -> datasets.Dataset:
970
+ if self.has_test_docs():
971
+ if self.config.process_docs is not None:
972
+ return self.config.process_docs(self.dataset[self.config.test_split])
973
+ return self.dataset[self.config.test_split]
974
+
975
+ def fewshot_docs(self):
976
+ if self.config.fewshot_split is not None:
977
+ if self.config.process_docs is not None:
978
+ return self.config.process_docs(self.dataset[self.config.fewshot_split])
979
+ return self.dataset[self.config.fewshot_split]
980
+ elif (
981
+ self.config.fewshot_config is not None
982
+ and self.config.fewshot_config.get("samples", None) is not None
983
+ ):
984
+ if isinstance(self.config.fewshot_config["samples"], list):
985
+ return self.config.fewshot_config["samples"]
986
+ elif callable(self.config.fewshot_config["samples"]):
987
+ return self.config.fewshot_config["samples"]()
988
+ else:
989
+ raise Exception(
990
+ "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of sample dicts or a function returning such a list."
991
+ )
992
+ else:
993
+ if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
994
+ eval_logger.warning(
995
+ f"[Task: {self.config.task}] "
996
+ "num_fewshot > 0 but fewshot_split is None. "
997
+ "using preconfigured rule."
998
+ )
999
+ return super().fewshot_docs()
1000
+
1001
+ @staticmethod
1002
+ def append_target_question(
1003
+ labeled_examples: List[Dict[str, str]],
1004
+ question: str,
1005
+ fewshot_as_multiturn: bool = False,
1006
+ ) -> None:
1007
+ """Adds a target question to the labeled examples list.
1008
+ If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
1009
+ Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
1010
+ """
1011
+ if not fewshot_as_multiturn:
1012
+ # if no messages or last message is system, append as new user entry
1013
+ if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
1014
+ labeled_examples.append({"role": "user", "content": question})
1015
+ # if last message is user, append to it to avoid two user messages in a row
1016
+ else:
1017
+ labeled_examples[-1]["content"] += question
1018
+ else:
1019
+ # if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
1020
+ labeled_examples.append({"role": "user", "content": question})
1021
+
1022
+ @utils.positional_deprecated
1023
+ def fewshot_context(
1024
+ self,
1025
+ doc: str,
1026
+ num_fewshot: int,
1027
+ system_instruction: Optional[str] = None,
1028
+ apply_chat_template: bool = False,
1029
+ fewshot_as_multiturn: bool = False,
1030
+ chat_template: Optional[Callable] = None,
1031
+ ) -> str:
1032
+ """Returns a fewshot context string that is made up of a prepended description
1033
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
1034
+
1035
+ :param doc: str
1036
+ The document as returned from training_docs, validation_docs, or test_docs.
1037
+ :param num_fewshot: int
1038
+ The number of fewshot examples to provide in the returned context string.
1039
+ :param system_instruction: str
1040
+ System instruction to be applied to the prompt.
1041
+ :param apply_chat_template: bool
1042
+ Whether to apply the chat template to the fewshot context.
1043
+ :param fewshot_as_multiturn: bool
1044
+ Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
1045
+ :param chat_template: Callable
1046
+ Chat template to be applied to the fewshot context.
1047
+ :returns: str
1048
+ The fewshot context.
1049
+ """
1050
+
1051
+ if apply_chat_template:
1052
+ labeled_examples = []
1053
+ else:
1054
+ labeled_examples = ""
1055
+
1056
+ # get task description
1057
+ if description := self.config.description:
1058
+ description = utils.apply_template(self.config.description, doc)
1059
+
1060
+ # create system prompt based on the provided system instruction and description
1061
+ if system_instruction is not None and description:
1062
+ system_prompt = (
1063
+ f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
1064
+ )
1065
+ elif system_instruction is not None:
1066
+ system_prompt = system_instruction
1067
+ elif description:
1068
+ system_prompt = description
1069
+ else:
1070
+ system_prompt = ""
1071
+
1072
+ # add system prompt if specified
1073
+ if system_prompt:
1074
+ if apply_chat_template:
1075
+ labeled_examples.append({"role": "system", "content": system_prompt})
1076
+ else:
1077
+ labeled_examples = system_prompt
1078
+
1079
+ # if few-shot - append examples after the system prompt
1080
+ if num_fewshot > 0:
1081
+ if apply_chat_template:
1082
+ labeled_examples.extend(
1083
+ self.sampler.get_chat_context(
1084
+ doc, num_fewshot, fewshot_as_multiturn
1085
+ )
1086
+ )
1087
+ else:
1088
+ labeled_examples += self.sampler.get_context(doc, num_fewshot)
1089
+
1090
+ example = self.doc_to_text(doc)
1091
+ if apply_chat_template:
1092
+ if self.multiple_input:
1093
+ return chat_template(labeled_examples)
1094
+ if isinstance(example, str):
1095
+ self.append_target_question(
1096
+ labeled_examples, example, fewshot_as_multiturn
1097
+ )
1098
+ # for loglikelihood create a list of questions with appended choices
1099
+ elif isinstance(example, list):
1100
+ labeled_examples_list = []
1101
+ # copy chat history for each example and append the answer
1102
+ for ex in example:
1103
+ chat = deepcopy(labeled_examples)
1104
+ self.append_target_question(chat, ex, fewshot_as_multiturn)
1105
+ labeled_examples_list.append(chat_template(chat))
1106
+ return labeled_examples_list
1107
+ # if example is an integer, append the choice or convert to string
1108
+ elif isinstance(example, int):
1109
+ if self.config.doc_to_choice is not None:
1110
+ choices = self.doc_to_choice(doc)
1111
+ self.append_target_question(
1112
+ labeled_examples, choices[example], fewshot_as_multiturn
1113
+ )
1114
+ else:
1115
+ self.append_target_question(
1116
+ labeled_examples, str(example), fewshot_as_multiturn
1117
+ )
1118
+ # return lm.apply_chat_template(labeled_examples)
1119
+ return chat_template(labeled_examples)
1120
+ else:
1121
+ if self.multiple_input:
1122
+ return labeled_examples
1123
+ if isinstance(example, str):
1124
+ return labeled_examples + example
1125
+ elif isinstance(example, list):
1126
+ return [labeled_examples + ex for ex in example]
1127
+ elif isinstance(example, int):
1128
+ if self.config.doc_to_choice is not None:
1129
+ choices = self.doc_to_choice(doc)
1130
+ return labeled_examples + choices[example]
1131
+ else:
1132
+ return labeled_examples + str(example)
1133
+
1134
+ def apply_filters(self):
1135
+ """Iterates over FilterEnsembles and applies them to instances"""
1136
+ if hasattr(self, "_filters"):
1137
+ for f in self._filters:
1138
+ f.apply(self._instances)
1139
+ else:
1140
+ eval_logger.warning("No filter defined, passing through instances")
1141
+ return self._instances
1142
+
1143
+ def should_decontaminate(self):
1144
+ return self.config.should_decontaminate
1145
+
1146
+ def doc_to_decontamination_query(self, doc):
1147
+ if self.config.should_decontaminate:
1148
+ if self.config.doc_to_decontamination_query is None:
1149
+ return self.doc_to_text(doc)
1150
+ else:
1151
+ doc_to_decontamination_query = self.config.doc_to_decontamination_query
1152
+ if doc_to_decontamination_query in self.features:
1153
+ return doc[doc_to_decontamination_query]
1154
+ elif callable(doc_to_decontamination_query):
1155
+ return doc_to_decontamination_query(doc)
1156
+ else:
1157
+ return ast.literal_eval(
1158
+ utils.apply_template(
1159
+ self.config.doc_to_decontamination_query, doc
1160
+ )
1161
+ )
1162
+
1163
+ def _process_doc(self, doc: dict) -> dict:
1164
+ """
1165
+ Override this to process (detokenize, strip, replace, etc.) individual
1166
+ documents. This can be used in a map over documents of a data split.
1167
+ E.g. `map(self._process_doc, self.dataset["validation"])`
1168
+
1169
+ :return: dict
1170
+ The processed version of the specified `doc`.
1171
+ """
1172
+ return doc
1173
+
1174
+ def doc_to_text(self, doc, doc_to_text=None):
1175
+ if self.prompt is not None:
1176
+ doc_to_text = self.prompt
1177
+ elif doc_to_text is not None:
1178
+ doc_to_text = doc_to_text
1179
+ else:
1180
+ doc_to_text = self.config.doc_to_text
1181
+
1182
+ if isinstance(doc_to_text, int):
1183
+ return doc_to_text
1184
+ elif isinstance(doc_to_text, str):
1185
+ if doc_to_text in self.features:
1186
+ # if self.config.doc_to_choice is not None:
1187
+ # return self.doc_to_choice(doc)[doc[doc_to_text]]
1188
+ # else:
1189
+ return doc[doc_to_text]
1190
+ else:
1191
+ text_string = utils.apply_template(doc_to_text, doc)
1192
+ if text_string.isdigit() and self._config.doc_to_choice is not None:
1193
+ return ast.literal_eval(text_string)
1194
+ else:
1195
+ return text_string
1196
+ elif callable(doc_to_text):
1197
+ return doc_to_text(doc)
1198
+ # Used when applying a Promptsource template
1199
+ elif hasattr(doc_to_text, "apply"):
1200
+ applied_prompt = doc_to_text.apply(doc)
1201
+ if len(applied_prompt) == 2:
1202
+ return applied_prompt[0]
1203
+ else:
1204
+ eval_logger.warning("Applied prompt returns empty string")
1205
+ return self.config.fewshot_delimiter
1206
+ else:
1207
+ print(type(doc_to_text))
1208
+ raise TypeError
1209
+
1210
+ def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
1211
+ if self.prompt is not None:
1212
+ doc_to_target = self.prompt
1213
+ elif doc_to_target is not None:
1214
+ doc_to_target = doc_to_target
1215
+ else:
1216
+ doc_to_target = self.config.doc_to_target
1217
+
1218
+ if isinstance(doc_to_target, int):
1219
+ return doc_to_target
1220
+ elif isinstance(doc_to_target, str):
1221
+ if doc_to_target in self.features:
1222
+ # if self.config.doc_to_choice is not None:
1223
+ # return self.doc_to_choice(doc)[doc[doc_to_target]]
1224
+ # else:
1225
+ return doc[doc_to_target]
1226
+ else:
1227
+ target_string = utils.apply_template(doc_to_target, doc)
1228
+ if target_string.isdigit() and self._config.doc_to_choice is not None:
1229
+ return ast.literal_eval(target_string)
1230
+ elif (
1231
+ len(target_string) >= 2
1232
+ and (target_string[0] == "[")
1233
+ and (target_string[-1] == "]")
1234
+ ):
1235
+ try:
1236
+ return ast.literal_eval(target_string)
1237
+ except (SyntaxError, ValueError):
1238
+ return target_string
1239
+ else:
1240
+ return target_string
1241
+ elif isinstance(doc_to_target, list):
1242
+ return doc_to_target
1243
+ elif callable(doc_to_target):
1244
+ return doc_to_target(doc)
1245
+ # Used when applying a Promptsource template
1246
+ elif hasattr(doc_to_target, "apply"):
1247
+ applied_prompt = doc_to_target.apply(doc)
1248
+ if len(applied_prompt) == 2:
1249
+ return applied_prompt[1]
1250
+ else:
1251
+ eval_logger.warning("Applied prompt returns empty string")
1252
+ return self.config.fewshot_delimiter
1253
+ else:
1254
+ raise TypeError
1255
+
1256
+ def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
1257
+ if self.prompt is not None:
1258
+ doc_to_choice = self.prompt
1259
+ elif doc_to_choice is not None:
1260
+ doc_to_choice = doc_to_choice
1261
+ elif self.config.doc_to_choice is None:
1262
+ eval_logger.error("doc_to_choice was called but not set in config")
1263
+ else:
1264
+ doc_to_choice = self.config.doc_to_choice
1265
+
1266
+ if isinstance(doc_to_choice, str):
1267
+ if doc_to_choice in self.features:
1268
+ return doc[doc_to_choice]
1269
+ else:
1270
+ return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
1271
+ elif isinstance(doc_to_choice, list):
1272
+ return doc_to_choice
1273
+ elif isinstance(doc_to_choice, dict):
1274
+ return list(doc_to_choice.values())
1275
+ elif callable(doc_to_choice):
1276
+ return doc_to_choice(doc)
1277
+ elif hasattr(doc_to_choice, "get_answer_choices_list"):
1278
+ return doc_to_choice.get_answer_choices_list(doc)
1279
+ else:
1280
+ raise TypeError
1281
+
1282
+ def construct_requests(
1283
+ self, doc: dict, ctx: str, **kwargs
1284
+ ) -> Union[List[Instance], Instance]:
1285
+ if self.OUTPUT_TYPE == "loglikelihood":
1286
+ arguments = (ctx, self.doc_to_target(doc))
1287
+ elif self.OUTPUT_TYPE == "loglikelihood_rolling":
1288
+ arguments = (self.doc_to_target(doc),)
1289
+ elif self.OUTPUT_TYPE == "multiple_choice":
1290
+ choices = self.doc_to_choice(doc)
1291
+ target_delimiter = self.config.target_delimiter
1292
+ if self.multiple_input:
1293
+ # If there are multiple inputs, choices are placed in the ctx
1294
+ cont = self.doc_to_target(doc)
1295
+ arguments = [
1296
+ (ctx + choice, f"{target_delimiter}{cont}") for choice in choices
1297
+ ]
1298
+ else:
1299
+ # Otherwise they are placed in the continuation
1300
+ arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
1301
+
1302
+ request_list = [
1303
+ Instance(
1304
+ request_type="loglikelihood",
1305
+ doc=doc,
1306
+ arguments=arg,
1307
+ idx=i,
1308
+ **kwargs,
1309
+ )
1310
+ for i, arg in enumerate(arguments)
1311
+ ]
1312
+ # TODO: we should raise a warning telling users this will increase runtime by up to ~2x.
1313
+ if "acc_mutual_info" in self._metric_fn_list.keys():
1314
+ # if we are calculating multiple choice accuracy
1315
+ # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
1316
+
1317
+ # here mutual info refers to calculating
1318
+ # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
1319
+ # in other words normalizing by subtracting the unconditional logprob of each choice.
1320
+ request_list.extend(
1321
+ [
1322
+ Instance(
1323
+ request_type="loglikelihood",
1324
+ doc=doc,
1325
+ arguments=("", "{}".format(choice)),
1326
+ idx=i,
1327
+ **kwargs,
1328
+ )
1329
+ for i, choice in enumerate(choices)
1330
+ ]
1331
+ )
1332
+ return request_list
1333
+
1334
+ elif self.OUTPUT_TYPE == "generate_until":
1335
+ arguments = (ctx, deepcopy(self.config.generation_kwargs))
1336
+
1337
+ return Instance(
1338
+ request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
1339
+ )
1340
+
1341
+ def process_results(self, doc, results):
1342
+ if callable(self.config.process_results):
1343
+ return self.config.process_results(doc, results)
1344
+
1345
+ result_dict = {}
1346
+ use_metric = list(self._metric_fn_list.keys())
1347
+ if self.OUTPUT_TYPE == "loglikelihood":
1348
+ results = results[0]
1349
+ ll, is_greedy = results
1350
+ return {
1351
+ **({"perplexity": ll} if "perplexity" in use_metric else {}),
1352
+ **({"acc": int(is_greedy)} if "acc" in use_metric else {}),
1353
+ }
1354
+ elif self.OUTPUT_TYPE == "loglikelihood_rolling":
1355
+ (loglikelihood,) = results
1356
+ _words = self.count_words(self.doc_to_target(doc))
1357
+ _bytes = self.count_bytes(self.doc_to_target(doc))
1358
+ return {
1359
+ **(
1360
+ {"word_perplexity": (loglikelihood, _words)}
1361
+ if "word_perplexity" in use_metric
1362
+ else {}
1363
+ ),
1364
+ **(
1365
+ {"byte_perplexity": (loglikelihood, _bytes)}
1366
+ if "byte_perplexity" in use_metric
1367
+ else {}
1368
+ ),
1369
+ **(
1370
+ {"bits_per_byte": (loglikelihood, _bytes)}
1371
+ if "bits_per_byte" in use_metric
1372
+ else {}
1373
+ ),
1374
+ }
1375
+ elif self.OUTPUT_TYPE == "multiple_choice":
1376
+ lls, is_greedy = zip(*results)
1377
+
1378
+ # retrieve choices in List[str] form, to compute choice lengths, etc.
1379
+ choices = self.doc_to_choice(doc)
1380
+ completion_len = np.array([float(len(i)) for i in choices])
1381
+
1382
+ if (
1383
+ 2 * len(choices) == len(lls)
1384
+ and "acc_mutual_info" in self._metric_fn_list.keys()
1385
+ ):
1386
+ # then we are doing mutual info.
1387
+ # this stores the "dryrun" / unconditional answer loglikelihoods
1388
+ lls_unconditional = lls[1::2]
1389
+ if len(lls_unconditional) != len(choices):
1390
+ raise ValueError
1391
+ # and this stores our "regular" conditional loglikelihoods
1392
+ lls = lls[::2]
1393
+
1394
+ pred = np.argmax(lls)
1395
+ pred_norm = np.argmax(lls / completion_len)
1396
+
1397
+ if self.multiple_input:
1398
+ gold = self.doc_to_text(doc)
1399
+ else:
1400
+ gold = self.doc_to_target(doc)
1401
+
1402
+ gold_index_error = False
1403
+ if isinstance(gold, list):
1404
+ gold = [i if i < len(choices) else -100 for i in gold]
1405
+ if -100 in gold:
1406
+ gold_index_error = True
1407
+ else:
1408
+ if isinstance(gold, int):
1409
+ gold = gold if gold < len(choices) else -100
1410
+ elif isinstance(gold, str):
1411
+ gold = choices.index(gold) if gold in choices else -100
1412
+
1413
+ if gold == -100:
1414
+ gold_index_error = True
1415
+
1416
+ if gold_index_error:
1417
+ eval_logger.warning(
1418
+ f"Label index was not within the range of available choices. "
1419
+ f"Sample:\n\n{doc}\n\n"
1420
+ )
1421
+
1422
+ if self.multiple_target:
1423
+ acc = 1.0 if pred in gold else 0.0
1424
+ acc_norm = 1.0 if pred_norm in gold else 0.0
1425
+ exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
1426
+ else:
1427
+ acc = 1.0 if pred == gold else 0.0
1428
+ acc_norm = 1.0 if pred_norm == gold else 0.0
1429
+ # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
1430
+ exact_match = int(is_greedy[gold]) if gold != -100 else 0
1431
+
1432
+ prob_norm = utils.softmax(lls)
1433
+
1434
+ # TODO use keyword arguments to the metric?
1435
+ # gold, pred, norm stuff, the original lls,
1436
+ result_dict = {
1437
+ **({"acc": acc} if "acc" in use_metric else {}),
1438
+ **({"f1": (gold, pred)} if "f1" in use_metric else {}),
1439
+ **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
1440
+ **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
1441
+ **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
1442
+ **(
1443
+ {"brier_score": (gold, prob_norm)}
1444
+ if "brier_score" in use_metric
1445
+ else {}
1446
+ ),
1447
+ }
1448
+
1449
+ if "acc_mutual_info" in use_metric:
1450
+ lls_mutual_info = [
1451
+ ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
1452
+ ]
1453
+ acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
1454
+ result_dict["acc_mutual_info"] = acc_mutual_info
1455
+
1456
+ elif self.OUTPUT_TYPE == "generate_until":
1457
+ gold = self.doc_to_target(doc)
1458
+ result = results[0]
1459
+ if self.config.doc_to_choice is not None:
1460
+ # If you set doc_to_choice,
1461
+ # it assumes that doc_to_target returns a number.
1462
+ choices = self.doc_to_choice(doc)
1463
+ gold = choices[gold]
1464
+ # we expect multiple_targets to be a list.
1465
+ elif self.multiple_target:
1466
+ gold = list(gold)
1467
+ elif type(gold) != type(result):
1468
+ # cast gold to the same type as result
1469
+ gold = type(result)(gold)
1470
+
1471
+ for metric in self._metric_fn_list.keys():
1472
+ if self.multiple_target:
1473
+ # in the case where we have multiple targets,
1474
+ # return true if any are true
1475
+ # TODO: this may break for multiple_target, non zero-or-1 metrics
1476
+ scores = []
1477
+ if not isinstance(gold, list):
1478
+ # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
1479
+ # print(gold)
1480
+ gold = [gold]
1481
+ if metric == "exact_match":
1482
+ result = [result for _ in range(len(gold))]
1483
+ scores = self._metric_fn_list[metric](
1484
+ references=gold,
1485
+ predictions=result,
1486
+ **self._metric_fn_kwargs[metric],
1487
+ )[metric]
1488
+ result_score = 1.0 if scores > 0.0 else 0.0
1489
+ else:
1490
+ for gold_option in gold:
1491
+ try:
1492
+ result_score = self._metric_fn_list[metric](
1493
+ references=[gold_option],
1494
+ predictions=[result],
1495
+ **self._metric_fn_kwargs[metric],
1496
+ )
1497
+ except (
1498
+ TypeError
1499
+ ): # TODO: this is hacky and I don't want to do it
1500
+ result_score = self._metric_fn_list[metric](
1501
+ [gold_option, result]
1502
+ )
1503
+ if isinstance(result_score, dict):
1504
+ # TODO: this handles the case where HF evaluate returns a dict.
1505
+ result_score = result_score[metric]
1506
+ scores.append(result_score)
1507
+ if any(scores):
1508
+ result_score = 1.0
1509
+ else:
1510
+ result_score = 0.0
1511
+ else:
1512
+ try:
1513
+ result_score = self._metric_fn_list[metric](
1514
+ references=[gold],
1515
+ predictions=[result],
1516
+ **self._metric_fn_kwargs[metric],
1517
+ )
1518
+ except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
1519
+ result_score = self._metric_fn_list[metric]([gold, result])
1520
+ if isinstance(result_score, dict):
1521
+ # TODO: this handles the case where HF evaluate returns a dict.
1522
+ result_score = result_score[metric]
1523
+ result_dict[metric] = result_score
1524
+ else:
1525
+ raise ValueError(
1526
+ f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
1527
+ "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
1528
+ )
1529
+
1530
+ return result_dict
1531
+
1532
+ def aggregation(self) -> dict:
1533
+ return self._aggregation_list
1534
+
1535
+ def higher_is_better(self) -> dict:
1536
+ return self._higher_is_better
1537
+
1538
+ def get_config(self, key: str) -> Any:
1539
+ return getattr(self._config, key, None)
1540
+
1541
+ @property
1542
+ def task_name(self) -> Any:
1543
+ return getattr(self.config, "task", None)
1544
+
1545
+ def __repr__(self):
1546
+ return (
1547
+ f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
1548
+ f"output_type={self.OUTPUT_TYPE},"
1549
+ f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
1550
+ f"num_samples={len(self.eval_docs)})"
1551
+ )
1552
+
1553
+
1554
+ class MultipleChoiceTask(Task):
1555
+ OUTPUT_TYPE = "loglikelihood"
1556
+
1557
+ def doc_to_target(self, doc: dict) -> str:
1558
+ return " " + doc["choices"][doc["gold"]]
1559
+
1560
+ def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]:
1561
+ # TODO: add mutual info here?
1562
+ return [
1563
+ Instance(
1564
+ request_type="loglikelihood",
1565
+ doc=doc,
1566
+ arguments=(ctx, " {}".format(choice)),
1567
+ idx=i,
1568
+ **kwargs,
1569
+ )
1570
+ for i, choice in enumerate(doc["choices"])
1571
+ ]
1572
+
1573
+ def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict:
1574
+ results = [
1575
+ res[0] for res in results
1576
+ ] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere?
1577
+ gold = doc["gold"]
1578
+
1579
+ acc = 1.0 if np.argmax(results) == gold else 0.0
1580
+ completion_len = np.array([float(len(i)) for i in doc["choices"]])
1581
+ acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
1582
+
1583
+ return {
1584
+ "acc": acc,
1585
+ "acc_norm": acc_norm,
1586
+ }
1587
+
1588
+ def higher_is_better(self) -> dict:
1589
+ return {
1590
+ "acc": True,
1591
+ "acc_norm": True,
1592
+ }
1593
+
1594
+ def aggregation(self) -> dict:
1595
+ return {
1596
+ "acc": mean,
1597
+ "acc_norm": mean,
1598
+ }
1599
+
1600
+
1601
+ class PerplexityTask(Task):
1602
+ OUTPUT_TYPE = "loglikelihood_rolling"
1603
+
1604
+ def has_training_docs(self) -> bool:
1605
+ return False
1606
+
1607
+ def fewshot_examples(self, k: int, rnd) -> List:
1608
+ if k != 0:
1609
+ raise ValueError(
1610
+ "The number of fewshot examples must be 0 for perplexity tasks."
1611
+ )
1612
+ return []
1613
+
1614
+ def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]:
1615
+ if num_fewshot != 0:
1616
+ raise ValueError(
1617
+ "The number of fewshot examples must be 0 for perplexity tasks."
1618
+ )
1619
+
1620
+ return ""
1621
+
1622
+ def higher_is_better(self) -> dict:
1623
+ return {
1624
+ "word_perplexity": False,
1625
+ "byte_perplexity": False,
1626
+ "bits_per_byte": False,
1627
+ }
1628
+
1629
+ def doc_to_decontamination_query(self, doc):
1630
+ return doc
1631
+
1632
+ def doc_to_text(self, doc) -> str:
1633
+ return ""
1634
+
1635
+ def doc_to_target(self, doc):
1636
+ return doc
1637
+
1638
+ def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs):
1639
+ if bool(ctx):
1640
+ raise ValueError
1641
+
1642
+ return Instance(
1643
+ request_type=self.OUTPUT_TYPE,
1644
+ doc=doc,
1645
+ arguments=(self.doc_to_target(doc),),
1646
+ idx=0,
1647
+ **kwargs,
1648
+ )
1649
+
1650
+ def process_results(self, doc: dict, results: Tuple[float]) -> dict:
1651
+ (loglikelihood,) = results
1652
+ words = self.count_words(self.doc_to_target(doc))
1653
+ bytes_ = self.count_bytes(self.doc_to_target(doc))
1654
+ return {
1655
+ "word_perplexity": (loglikelihood, words),
1656
+ "byte_perplexity": (loglikelihood, bytes_),
1657
+ "bits_per_byte": (loglikelihood, bytes_),
1658
+ }
1659
+
1660
+ def aggregation(self) -> dict:
1661
+ return {
1662
+ "word_perplexity": weighted_perplexity,
1663
+ "byte_perplexity": weighted_perplexity,
1664
+ "bits_per_byte": bits_per_byte,
1665
+ }
1666
+
1667
+ @classmethod
1668
+ def count_bytes(cls, doc) -> int:
1669
+ return len(doc.encode("utf-8"))
1670
+
1671
+ @classmethod
1672
+ def count_words(cls, doc) -> int:
1673
+ """Downstream tasks with custom word boundaries should override this!"""
1674
+ return len(re.split(r"\s+", doc))
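The PerplexityTask above returns a (loglikelihood, count) pair per document and defers aggregation to weighted_perplexity / bits_per_byte. A minimal standalone sketch of that bookkeeping follows; the counting mirrors count_words / count_bytes above, while the exp(-sum(ll)/sum(weights)) aggregation formula is an assumption added here for illustration and is not taken from this diff.

# Illustrative sketch only (not part of the committed file): aggregate
# per-document (loglikelihood, weight) pairs into a corpus-level perplexity.
import math
import re


def count_bytes(doc: str) -> int:
    # same rule as PerplexityTask.count_bytes above
    return len(doc.encode("utf-8"))


def count_words(doc: str) -> int:
    # same whitespace rule as PerplexityTask.count_words above
    return len(re.split(r"\s+", doc))


def weighted_perplexity(pairs) -> float:
    # assumed aggregation: exp of the negative total loglikelihood per unit
    total_ll = sum(ll for ll, _ in pairs)
    total_weight = sum(weight for _, weight in pairs)
    return math.exp(-total_ll / total_weight)


docs = ["a toy document", "another short one"]  # hypothetical corpus
lls = [-12.3, -9.8]  # hypothetical rolling loglikelihoods

word_ppl = weighted_perplexity(list(zip(lls, (count_words(d) for d in docs))))
byte_ppl = weighted_perplexity(list(zip(lls, (count_bytes(d) for d in docs))))
print(f"word ppl={word_ppl:.2f}, byte ppl={byte_ppl:.2f}")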
scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ from . import (
2
+ anthropic_llms,
3
+ api_models,
4
+ dummy,
5
+ gguf,
6
+ huggingface,
7
+ mamba_lm,
8
+ nemo_lm,
9
+ neuralmagic,
10
+ neuron_optimum,
11
+ openai_completions,
12
+ optimum_lm,
13
+ textsynth,
14
+ vllm_causallms,
15
+ )
16
+
17
+
18
+ # TODO: implement __all__
19
+
20
+
21
+ try:
22
+ # enable hf hub transfer if available
23
+ import hf_transfer # type: ignore # noqa
24
+ import huggingface_hub.constants # type: ignore
25
+
26
+ huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
27
+ except ImportError:
28
+ pass
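The trailing try/except ImportError is an optional-dependency toggle: when hf_transfer is importable, Hub downloads are switched to the faster transfer backend, otherwise the import error is silently ignored. A small check of whether that flag applies might look like the sketch below (it assumes huggingface_hub is installed; hf_transfer may or may not be).

# Sketch only (not part of the committed file): inspect the flag that the
# module's import-time toggle above would set when hf_transfer is available.
from importlib.util import find_spec

import huggingface_hub.constants as hub_constants

if find_spec("hf_transfer") is not None:
    print("hf_transfer installed; HF_HUB_ENABLE_HF_TRANSFER =",
          hub_constants.HF_HUB_ENABLE_HF_TRANSFER)
else:
    print("hf_transfer not installed; standard downloads will be used")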
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (631 Bytes). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/anthropic_llms.cpython-310.pyc ADDED
Binary file (11 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/api_models.cpython-310.pyc ADDED
Binary file (16.6 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/dummy.cpython-310.pyc ADDED
Binary file (1.58 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/gguf.cpython-310.pyc ADDED
Binary file (4.11 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/huggingface.cpython-310.pyc ADDED
Binary file (29.8 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/mamba_lm.cpython-310.pyc ADDED
Binary file (3.69 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/nemo_lm.cpython-310.pyc ADDED
Binary file (13.7 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/neuralmagic.cpython-310.pyc ADDED
Binary file (11 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/neuron_optimum.cpython-310.pyc ADDED
Binary file (18.3 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/openai_completions.cpython-310.pyc ADDED
Binary file (6.39 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/optimum_lm.cpython-310.pyc ADDED
Binary file (2.65 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/textsynth.cpython-310.pyc ADDED
Binary file (5.23 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/utils.cpython-310.pyc ADDED
Binary file (21.3 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/__pycache__/vllm_causallms.cpython-310.pyc ADDED
Binary file (14.3 kB). View file
 
scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py ADDED
@@ -0,0 +1,362 @@
1
+ import os
2
+ from functools import cached_property
3
+ from typing import Any, Dict, List, Tuple, Union
4
+
5
+ from tqdm import tqdm
6
+
7
+ from lm_eval import utils
8
+ from lm_eval.api.model import LM
9
+ from lm_eval.api.registry import register_model
10
+ from lm_eval.models.openai_completions import LocalCompletionsAPI
11
+ from lm_eval.models.utils import retry_on_specific_exceptions
12
+
13
+
14
+ eval_logger = utils.eval_logger
15
+
16
+
17
+ def anthropic_completion(
18
+ client, #: anthropic.Anthropic,
19
+ model: str,
20
+ prompt: str,
21
+ max_tokens_to_sample: int,
22
+ temperature: float,
23
+ stop: List[str],
24
+ **kwargs: Any,
25
+ ) -> str:
26
+ """Wrapper function around the Anthropic completion API client with exponential back-off
27
+ in case of RateLimitError.
28
+
29
+ params:
30
+ client: anthropic.Anthropic
31
+ Anthropic API client
32
+ model: str
33
+ Anthropic model e.g. 'claude-instant-v1', 'claude-2'
34
+ prompt: str
35
+ Prompt to feed to the model
36
+ max_tokens_to_sample: int
37
+ Maximum number of tokens to sample from the model
38
+ temperature: float
39
+ Sampling temperature
40
+ stop: List[str]
41
+ List of stop sequences
42
+ kwargs: Any
43
+ Additional model_args to pass to the API client
44
+ """
45
+
46
+ try:
47
+ import anthropic
48
+ except ModuleNotFoundError:
49
+ raise Exception(
50
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
51
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
52
+ )
53
+
54
+ def _exception_callback(e: Exception, sleep_time: float) -> None:
55
+ eval_logger.warning(
56
+ f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
57
+ )
58
+
59
+ @retry_on_specific_exceptions(
60
+ on_exceptions=[anthropic.RateLimitError],
61
+ max_retries=None, # retry forever, consider changing
62
+ on_exception_callback=_exception_callback,
63
+ )
64
+ def completion():
65
+ response = client.completions.create(
66
+ prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
67
+ model=model,
68
+ # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
69
+ # (e.g. gsm8k's ":") may truncate a lot of the input.
70
+ stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
71
+ max_tokens_to_sample=max_tokens_to_sample,
72
+ temperature=temperature,
73
+ **kwargs,
74
+ )
75
+ return response.completion
76
+
77
+ return completion()
78
+
79
+
80
+ def anthropic_chat(
81
+ client, #: anthropic.Anthropic,
82
+ model: str,
83
+ prompt: str,
84
+ max_tokens: int,
85
+ temperature: float,
86
+ stop: List[str],
87
+ **kwargs: Any,
88
+ ) -> str:
89
+ """Wrapper function around the Anthropic completion API client with exponential back-off
90
+ in case of RateLimitError.
91
+
92
+ params:
93
+ client: anthropic.Anthropic
94
+ Anthropic API client
95
+ model: str
96
+ Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
97
+ prompt: str
98
+ Prompt to feed to the model
99
+ max_tokens: int
100
+ Maximum number of tokens to sample from the model
101
+ temperature: float
102
+ Sampling temperature
103
+ stop: List[str]
104
+ List of stop sequences
105
+ kwargs: Any
106
+ Additional model_args to pass to the API client
107
+ """
108
+
109
+ try:
110
+ import anthropic
111
+ except ModuleNotFoundError:
112
+ raise Exception(
113
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
114
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
115
+ )
116
+
117
+ def _exception_callback(e: Exception, sleep_time: float) -> None:
118
+ eval_logger.warning(
119
+ f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
120
+ )
121
+
122
+ @retry_on_specific_exceptions(
123
+ on_exceptions=[
124
+ anthropic.RateLimitError,
125
+ anthropic.APIConnectionError,
126
+ anthropic.APIStatusError,
127
+ ],
128
+ max_retries=None, # retry forever, consider changing
129
+ on_exception_callback=_exception_callback,
130
+ )
131
+ def messages():
132
+ response = client.messages.create(
133
+ model=model,
134
+ max_tokens=max_tokens,
135
+ temperature=temperature,
136
+ messages=[{"role": "user", "content": f"{prompt}"}],
137
+ **kwargs,
138
+ )
139
+ return response.content[0].text
140
+
141
+ return messages()
142
+
143
+
144
+ @register_model("anthropic-completions")
145
+ class AnthropicLM(LM):
146
+ REQ_CHUNK_SIZE = 20 # TODO: not used
147
+
148
+ def __init__(
149
+ self,
150
+ batch_size: int = 1,
151
+ model: str = "claude-2.0",
152
+ max_tokens_to_sample: int = 256,
153
+ temperature: float = 0, # defaults to 1
154
+ **kwargs, # top_p, top_k, etc.
155
+ ) -> None:
156
+ """Anthropic API wrapper.
157
+
158
+ :param model: str
159
+ Anthropic model e.g. 'claude-instant-v1', 'claude-2'
160
+ :param max_tokens_to_sample: int
161
+ Maximum number of tokens to sample from the model
162
+ :param temperature: float
163
+ Sampling temperature
164
+ :param kwargs: Any
165
+ Additional model_args to pass to the API client
166
+ """
167
+ super().__init__()
168
+
169
+ try:
170
+ import anthropic
171
+ except ModuleNotFoundError:
172
+ raise Exception(
173
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
174
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
175
+ )
176
+
177
+ self.model = model
178
+ # defaults to os.environ.get("ANTHROPIC_API_KEY")
179
+ self.client = anthropic.Anthropic()
180
+ self.temperature = temperature
181
+ self.max_tokens_to_sample = max_tokens_to_sample
182
+ self.tokenizer = self.client.get_tokenizer()
183
+ self.kwargs = kwargs
184
+
185
+ @property
186
+ def eot_token_id(self):
187
+ # Not sure but anthropic.HUMAN_PROMPT ?
188
+ raise NotImplementedError("No idea about anthropic tokenization.")
189
+
190
+ @property
191
+ def max_length(self) -> int:
192
+ return 2048
193
+
194
+ @property
195
+ def max_gen_toks(self) -> int:
196
+ return self.max_tokens_to_sample
197
+
198
+ @property
199
+ def batch_size(self):
200
+ # Isn't used because we override _loglikelihood_tokens
201
+ raise NotImplementedError("No support for logits.")
202
+
203
+ @property
204
+ def device(self):
205
+ # Isn't used because we override _loglikelihood_tokens
206
+ raise NotImplementedError("No support for logits.")
207
+
208
+ def tok_encode(self, string: str) -> List[int]:
209
+ return self.tokenizer.encode(string).ids
210
+
211
+ def tok_decode(self, tokens: List[int]) -> str:
212
+ return self.tokenizer.decode(tokens)
213
+
214
+ def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
215
+ raise NotImplementedError("No support for logits.")
216
+
217
+ def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
218
+ try:
219
+ import anthropic
220
+ except ModuleNotFoundError:
221
+ raise Exception(
222
+ "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
223
+ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
224
+ )
225
+
226
+ if not requests:
227
+ return []
228
+
229
+ _requests: List[Tuple[str, dict]] = [req.args for req in requests]
230
+
231
+ res = []
232
+ for request in tqdm(_requests, disable=disable_tqdm):
233
+ try:
234
+ inp = request[0]
235
+ request_args = request[1]
236
+ # generation_kwargs
237
+ until = request_args.get("until")
238
+ max_gen_toks = request_args.get("max_gen_toks", self.max_length)
239
+ temperature = request_args.get("temperature", self.temperature)
240
+ response = anthropic_completion(
241
+ client=self.client,
242
+ model=self.model,
243
+ prompt=inp,
244
+ max_tokens_to_sample=max_gen_toks,
245
+ temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
246
+ stop=until, # type: ignore
247
+ **self.kwargs,
248
+ )
249
+ res.append(response)
250
+
251
+ self.cache_hook.add_partial("generate_until", request, response)
252
+ except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
253
+ eval_logger.critical(f"Server unreachable: {e.__cause__}")
254
+ break
255
+ except anthropic.APIStatusError as e: # type: ignore # noqa: F821
256
+ eval_logger.critical(f"API error {e.status_code}: {e.message}")
257
+ break
258
+
259
+ return res
260
+
261
+ def _model_call(self, inps):
262
+ # Isn't used because we override _loglikelihood_tokens
263
+ raise NotImplementedError()
264
+
265
+ def _model_generate(self, context, max_length, eos_token_id):
266
+ # Isn't used because we override generate_until
267
+ raise NotImplementedError()
268
+
269
+ def loglikelihood(self, requests, disable_tqdm: bool = False):
270
+ raise NotImplementedError("No support for logits.")
271
+
272
+ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
273
+ raise NotImplementedError("No support for logits.")
274
+
275
+
276
+ @register_model("anthropic-chat", "anthropic-chat-completions")
277
+ class AnthropicChat(LocalCompletionsAPI):
278
+ def __init__(
279
+ self,
280
+ base_url="https://api.anthropic.com/v1/messages",
281
+ tokenizer_backend=None,
282
+ **kwargs,
283
+ ):
284
+ super().__init__(
285
+ base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
286
+ )
287
+ eval_logger.warning(
288
+ "Chat completions does not support batching. Defaulting to batch size 1."
289
+ )
290
+ self._batch_size = 1
291
+ self.anthropic_version = "2023-06-01"
292
+ eval_logger.warning(
293
+ f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
294
+ )
295
+
296
+ @cached_property
297
+ def api_key(self):
298
+ """Override this property to return the API key for the API request."""
299
+ key = os.environ.get("ANTHROPIC_API_KEY", None)
300
+ if key is None:
301
+ raise ValueError(
302
+ "API key not found. Please set the ANTHROPIC_API_KEY environment variable."
303
+ )
304
+ return key
305
+
306
+ @cached_property
307
+ def header(self):
308
+ return {
309
+ "x-api-key": f"{self.api_key}",
310
+ "anthropic-version": self.anthropic_version,
311
+ }
312
+
313
+ def _create_payload(
314
+ self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
315
+ ) -> dict:
316
+ system = (
317
+ messages[0].get("content") if messages[0].get("role") == "system" else None
318
+ )
319
+ if system:
320
+ messages = messages[1:]
321
+ gen_kwargs.pop("do_sample", False)
322
+ max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
323
+ temperature = gen_kwargs.pop("temperature", 0)
324
+ stop = gen_kwargs.pop("until", ["\n\nHuman:"])
325
+ if not isinstance(stop, list):
326
+ stop = [stop]
327
+ out = {
328
+ "messages": messages,
329
+ "model": self.model,
330
+ "max_tokens": max_tokens,
331
+ "temperature": temperature,
332
+ "stop_sequences": stop,
333
+ **gen_kwargs,
334
+ }
335
+ if system:
336
+ out["system"] = system
337
+ return out
338
+
339
+ def parse_generations(
340
+ self, outputs: Union[Dict, List[Dict]], **kwargs
341
+ ) -> List[str]:
342
+ res = []
343
+ if not isinstance(outputs, list):
344
+ outputs = [outputs]
345
+ for out in outputs:
346
+ for choices in out["content"]:
347
+ res.append(choices["text"])
348
+ return res
349
+
350
+ def tok_encode(
351
+ self,
352
+ string: str,
353
+ left_truncate_len=None,
354
+ add_special_tokens=None,
355
+ **kwargs,
356
+ ) -> List[str]:
357
+ return [string]
358
+
359
+ def loglikelihood(self, requests, **kwargs):
360
+ raise NotImplementedError(
361
+ "Anthropic Chat Completions API does not support the return of loglikelihood"
362
+ )
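For reference, _create_payload above turns an lm-eval chat history plus gen_kwargs into the Anthropic Messages request body: a leading system turn is hoisted into the "system" field and the remaining turns are passed through. A rough sketch of the resulting shape follows; the model name and values are made up for illustration and are not part of this diff.

# Sketch only: the approximate request body AnthropicChat._create_payload builds.
chat_history = [
    {"role": "system", "content": "Answer tersely."},
    {"role": "user", "content": "What is 2 + 2?"},
]

system = chat_history[0]["content"] if chat_history[0]["role"] == "system" else None
body = {
    "messages": chat_history[1:] if system else chat_history,
    "model": "claude-3-sonnet-20240229",  # hypothetical model choice
    "max_tokens": 256,                    # from max_gen_toks / _max_gen_toks
    "temperature": 0,
    "stop_sequences": ["\n\nHuman:"],     # default `until` used above
}
if system:
    body["system"] = system
print(body)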
scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py ADDED
@@ -0,0 +1,641 @@
1
+ import abc
2
+ import asyncio
3
+ import copy
4
+ import itertools
5
+ import json
6
+ from functools import cached_property
7
+ from typing import (
8
+ Any,
9
+ Awaitable,
10
+ Callable,
11
+ Dict,
12
+ Iterable,
13
+ List,
14
+ Literal,
15
+ NamedTuple,
16
+ Optional,
17
+ Tuple,
18
+ Union,
19
+ )
20
+
21
+
22
+ try:
23
+ import requests
24
+ from aiohttp import ClientSession, TCPConnector
25
+ from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
26
+ from tqdm import tqdm
27
+ from tqdm.asyncio import tqdm_asyncio
28
+ except ModuleNotFoundError:
29
+ pass
30
+
31
+
32
+ from importlib.util import find_spec
33
+
34
+ from lm_eval import utils
35
+ from lm_eval.api.instance import Instance
36
+ from lm_eval.api.model import TemplateLM
37
+ from lm_eval.models.utils import Collator, chunks, configure_pad_token
38
+
39
+
40
+ LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
41
+
42
+
43
+ # utility class to keep track of json encoded chats
44
+ class JsonChatStr(NamedTuple):
45
+ prompt: str
46
+
47
+ def encode(self, encoding):
48
+ return self.prompt.encode(encoding)
49
+
50
+
51
+ eval_logger = utils.eval_logger
52
+
53
+
54
+ class TemplateAPI(TemplateLM):
55
+ def __init__(
56
+ self,
57
+ model: str = None,
58
+ pretrained: str = None, # `model` takes precedence over `pretrained` when passed.
59
+ base_url: str = None,
60
+ tokenizer: Optional[str] = None,
61
+ # Loglikelihood tasks require a tokenizer to calculate context lengths,
62
+ # however the requests can be sent as a string if the API doesn't support token inputs.
63
+ # use tokenized_requests=False
64
+ tokenizer_backend: Optional[
65
+ Literal["tiktoken", "huggingface", None]
66
+ ] = "huggingface",
67
+ truncate: bool = False,
68
+ # number of concurrent requests. More useful if not batching
69
+ num_concurrent: int = 1,
70
+ max_retries: int = 3,
71
+ max_gen_toks: int = 256,
72
+ batch_size: Union[str, int] = 1,
73
+ seed: int = 1234,
74
+ max_length: Optional[int] = 2048,
75
+ add_bos_token: bool = False,
76
+ custom_prefix_token_id=None,
77
+ # send the requests as tokens or strings
78
+ tokenized_requests=True,
79
+ **kwargs,
80
+ ) -> None:
81
+ super().__init__()
82
+ missing_packages = [
83
+ pkg
84
+ for pkg in ["aiohttp", "tqdm", "tenacity", "requests"]
85
+ if find_spec(pkg) is None
86
+ ]
87
+ if missing_packages:
88
+ raise ModuleNotFoundError(
89
+ f"Attempted to use an API model, but the required packages {missing_packages} are not installed. "
90
+ 'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`'
91
+ )
92
+ self.model = model or pretrained
93
+ self.base_url = base_url
94
+ self.tokenizer = tokenizer
95
+ if not isinstance(batch_size, int) and "auto" in batch_size:
96
+ eval_logger.warning(
97
+ "Automatic batch size is not supported for API models. Defaulting to batch size 1."
98
+ )
99
+ elif int(batch_size) > 1:
100
+ eval_logger.warning(
101
+ "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths."
102
+ )
103
+ self._batch_size = int(batch_size) if batch_size != "auto" else 1
104
+ self._truncate = truncate
105
+ self._max_gen_toks = int(max_gen_toks)
106
+ self._seed = int(seed)
107
+ self.max_length = max_length
108
+ if int(num_concurrent) <= 1:
109
+ eval_logger.info(
110
+ "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
111
+ )
112
+ self._concurrent = int(num_concurrent)
113
+ self.tokenizer_backend = tokenizer_backend
114
+ self.add_bos_token = add_bos_token
115
+ self.custom_prefix_token_id = custom_prefix_token_id
116
+ self.tokenized_requests = tokenized_requests
117
+ self.max_retries = int(max_retries)
118
+
119
+ eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
120
+ if self.tokenizer_backend is None:
121
+ self.tokenizer = None
122
+ self.tokenized_requests = False
123
+ else:
124
+ if self.tokenizer is None:
125
+ if self.tokenizer_backend == "huggingface":
126
+ import transformers
127
+
128
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
129
+ self.tokenizer if self.tokenizer else self.model
130
+ )
131
+ # Not used as the API will handle padding but to mirror the behavior of the HFLM
132
+ self.tokenizer = configure_pad_token(self.tokenizer)
133
+ elif self.tokenizer_backend == "tiktoken":
134
+ try:
135
+ import tiktoken
136
+
137
+ self.tokenizer = tiktoken.encoding_for_model(self.model)
138
+ except ModuleNotFoundError as e:
139
+ raise Exception(
140
+ "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
141
+ "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
142
+ ) from e
143
+ if "openai" not in self.base_url:
144
+ eval_logger.warning(
145
+ f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
146
+ "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
147
+ )
148
+ else:
149
+ import transformers
150
+
151
+ assert isinstance(tokenizer, str), "tokenizer must be a string"
152
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
153
+ tokenizer,
154
+ )
155
+
156
+ @abc.abstractmethod
157
+ def _create_payload(
158
+ self,
159
+ messages: Union[List[List[int]], List[dict], List[str], str],
160
+ *,
161
+ generate: bool = True,
162
+ gen_kwargs: Optional[dict] = None,
163
+ seed: int = 1234,
164
+ **kwargs,
165
+ ) -> dict:
166
+ """This method is responsible for creating the json payload that will be sent to the API."""
167
+ raise NotImplementedError
168
+
169
+ def create_message(
170
+ self,
171
+ messages: Union[List[List[int]], List[str], List[JsonChatStr]],
172
+ generate=False,
173
+ ) -> Union[List[List[int]], List[dict], List[str], str]:
174
+ """Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
175
+ if isinstance(messages[0], JsonChatStr):
176
+ # for chat completions we need to decode the json string to list[dict,...]
177
+ assert (
178
+ self._batch_size == 1
179
+ ), "non-tokenized chat requests are only supported with batch_size=1"
180
+ # list[dict["role":..., "content":...],...]
181
+ return json.loads(messages[0].prompt)
182
+
183
+ if not self.tokenized_requests:
184
+ # if messages are tokenized:
185
+ if isinstance(messages[0][0], int):
186
+ # assuming decoding is lossless. However, this is only for loglikelihood requests
187
+ # as we need to compute the context length. For generations, we don't need to tokenize.
188
+ messages = self.decode_batch(messages)
189
+ if self._batch_size <= 1:
190
+ # if batch is 1 return str
191
+ return messages[0]
192
+ else:
193
+ # list[str,...]
194
+ return messages
195
+
196
+ # list[list[int], ...]
197
+ return messages
198
+
199
+ @staticmethod
200
+ @abc.abstractmethod
201
+ def parse_logprobs(
202
+ outputs: Union[Any, List[Any]],
203
+ tokens: List[List[int]] = None,
204
+ ctxlen: List[int] = None,
205
+ **kwargs,
206
+ ) -> List[Tuple[float, bool]]:
207
+ """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
208
+ raise NotImplementedError
209
+
210
+ @staticmethod
211
+ @abc.abstractmethod
212
+ def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
213
+ """Method used to parse the generations from the (batched) API response. This method should return a list of str"""
214
+ raise NotImplementedError
215
+
216
+ @cached_property
217
+ def api_key(self) -> str:
218
+ """Override this property to return the API key for the API request."""
219
+ return ""
220
+
221
+ @cached_property
222
+ def header(self) -> dict:
223
+ """Override this property to return the headers for the API request."""
224
+ return {"Authorization": f"Bearer {self.api_key}"}
225
+
226
+ @property
227
+ def chat_template(self) -> str:
228
+ """Must be defined for LM subclasses that implement Chat Templating.
229
+ Should return the structure of the chat template applied to user/assistant messages.
230
+ Only used for logging and reproducibility.
231
+ """
232
+ return ""
233
+
234
+ @property
235
+ def tokenizer_name(self) -> str:
236
+ """Must be defined for LM subclasses which implement Chat Templating.
237
+ Should return the name of the tokenizer or chat template used.
238
+ Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
239
+ """
240
+ return ""
241
+
242
+ def apply_chat_template(
243
+ self, chat_history: List[Dict[str, str]]
244
+ ) -> Union[str, JsonChatStr]:
245
+ """Applies a chat template to a list of chat history between user and model."""
246
+ if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
247
+ return self.tokenizer.apply_chat_template(
248
+ chat_history, tokenize=False, add_generation_prompt=True
249
+ )
250
+ else:
251
+ # bit of a hack. We'll load back before sending to the API
252
+ return JsonChatStr(json.dumps(chat_history))
253
+
254
+ @cached_property
255
+ def eot_token_id(self) -> Optional[int]:
256
+ if self.tokenizer is None:
257
+ return None
258
+ else:
259
+ if self.tokenizer_backend == "huggingface":
260
+ return self.tokenizer.eos_token_id
261
+ elif self.tokenizer_backend == "tiktoken":
262
+ return self.tokenizer.eot_token
263
+
264
+ @cached_property
265
+ def prefix_token_id(self) -> Optional[int]:
266
+ if self.tokenizer is None:
267
+ return None
268
+ else:
269
+ if self.custom_prefix_token_id is not None:
270
+ return self.custom_prefix_token_id
271
+ if self.tokenizer_backend == "huggingface":
272
+ if self.tokenizer.bos_token_id is not None:
273
+ return self.tokenizer.bos_token_id
274
+ return self.tokenizer.eos_token_id
275
+ else:
276
+ return self.tokenizer.eot_token
277
+
278
+ def tok_encode(
279
+ self,
280
+ string: str,
281
+ left_truncate_len: int = None,
282
+ add_special_tokens: bool = False,
283
+ truncation: bool = False,
284
+ **kwargs,
285
+ ) -> Union[List[List[int]], List[int], List[str]]:
286
+ if self.tokenizer_backend is None:
287
+ return [string]
288
+ elif self.tokenizer_backend == "huggingface":
289
+ # by default for CausalLM - false or self.add_bos_token is set
290
+ if not add_special_tokens:
291
+ add_special_tokens = False or self.add_bos_token
292
+ encoding: Union[List[List[int]], List[int]] = self.tokenizer(
293
+ string,
294
+ add_special_tokens=add_special_tokens,
295
+ truncation=truncation,
296
+ return_attention_mask=False,
297
+ ).input_ids
298
+
299
+ # left-truncate the encoded context to be at most `left_truncate_len` tokens long
300
+ if left_truncate_len:
301
+ if not isinstance(string, str):
302
+ encoding = [enc[-left_truncate_len:] for enc in encoding]
303
+ else:
304
+ encoding = encoding[-left_truncate_len:]
305
+
306
+ return encoding
307
+
308
+ else:
309
+ try:
310
+ encoding = self.tokenizer.encode(string)
311
+ except Exception:
312
+ encoding = self.tokenizer.encode_batch(string)
313
+ return encoding
314
+
315
+ def decode_batch(self, tokens: List[List[int]]) -> List[str]:
316
+ if self.tokenizer_backend == "huggingface":
317
+ return self.tokenizer.batch_decode(tokens)
318
+ elif self.tokenizer_backend == "tiktoken":
319
+ return self.tokenizer.decode_batch(tokens)
320
+
321
+ def model_call(
322
+ self,
323
+ messages: Union[List[List[int]], List[str], List[JsonChatStr]],
324
+ *,
325
+ generate: bool = True,
326
+ gen_kwargs: Optional[Dict] = None,
327
+ **kwargs,
328
+ ) -> Optional[dict]:
329
+ # !!! Copy: shared dict for each request, need new object !!!
330
+ gen_kwargs = copy.deepcopy(gen_kwargs)
331
+ try:
332
+ response = requests.post(
333
+ self.base_url,
334
+ json=self._create_payload(
335
+ self.create_message(messages),
336
+ generate=generate,
337
+ gen_kwargs=gen_kwargs,
338
+ seed=self._seed,
339
+ **kwargs,
340
+ ),
341
+ headers=self.header,
342
+ )
343
+ if not response.ok:
344
+ eval_logger.warning(
345
+ f"API request failed with error message: {response.text}. Retrying..."
346
+ )
347
+ response.raise_for_status()
348
+ return response.json()
349
+ except RetryError:
350
+ eval_logger.error(
351
+ "API request failed after multiple retries. Please check the API status."
352
+ )
353
+ return None
354
+
355
+ async def amodel_call(
356
+ self,
357
+ session: ClientSession,
358
+ messages: Union[List[List[int]], List[str], List[JsonChatStr]],
359
+ *,
360
+ generate: bool = True,
361
+ cache_keys: list = None,
362
+ ctxlens: Optional[List[int]] = None,
363
+ gen_kwargs: Optional[Dict] = None,
364
+ **kwargs,
365
+ ) -> Union[List[str], List[Tuple[float, bool]], None]:
366
+ # !!! Copy: shared dict for each request, need new object !!!
367
+ gen_kwargs = copy.deepcopy(gen_kwargs)
368
+ payload = self._create_payload(
369
+ self.create_message(messages),
370
+ generate=generate,
371
+ gen_kwargs=gen_kwargs,
372
+ seed=self._seed,
373
+ **kwargs,
374
+ )
375
+ cache_method = "generate_until" if generate else "loglikelihood"
376
+ try:
377
+ async with session.post(
378
+ self.base_url,
379
+ json=payload,
380
+ headers=self.header,
381
+ ) as response:
382
+ if not response.ok:
383
+ error_text = await response.text()
384
+ eval_logger.warning(
385
+ f"API request failed with error message: {error_text}. Retrying..."
386
+ )
387
+ # raising exception will retry the request
388
+ response.raise_for_status()
389
+ outputs = await response.json()
390
+ answers = (
391
+ self.parse_generations(
392
+ outputs=outputs,
393
+ )
394
+ if generate
395
+ else self.parse_logprobs(
396
+ outputs=outputs,
397
+ tokens=messages,
398
+ ctxlens=ctxlens,
399
+ )
400
+ )
401
+ if cache_keys:
402
+ for res, cache in zip(answers, cache_keys):
403
+ self.cache_hook.add_partial(cache_method, cache, res)
404
+ return answers
405
+ # If the retries also fail
406
+ except RetryError:
407
+ eval_logger.error(
408
+ "API request failed after multiple retries. Please check the API status."
409
+ )
410
+ return None
411
+
412
+ def batch_logliklehood_requests(
413
+ self, chunks: Iterable[List[LogLikelihoodInputs]]
414
+ ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
415
+ inputs = []
416
+ ctxlens = []
417
+ cache_keys = []
418
+ for chunk in chunks:
419
+ for cache_key, context_enc, continuation_enc in chunk:
420
+ inp = (context_enc + continuation_enc)[-(self.max_length) :]
421
+ ctxlen = len(context_enc) - max(
422
+ 0, len(context_enc) + len(continuation_enc) - (self.max_length)
423
+ )
424
+
425
+ inputs.append(inp)
426
+ ctxlens.append(ctxlen)
427
+ cache_keys.append(cache_key)
428
+ return inputs, ctxlens, cache_keys
429
+
430
+ async def get_batched_requests(
431
+ self,
432
+ requests: list,
433
+ cache_keys: list,
434
+ *,
435
+ generate: bool = True,
436
+ ctxlens: List[int] = None,
437
+ **kwargs,
438
+ ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
439
+ ctxlens = ctxlens if ctxlens else [None] * len(requests)
440
+ conn = TCPConnector(limit=self._concurrent)
441
+ async with ClientSession(connector=conn) as session:
442
+ retry_: Callable[..., Awaitable[Any]] = retry(
443
+ stop=stop_after_attempt(self.max_retries),
444
+ wait=wait_exponential(multiplier=0.5, min=1, max=10),
445
+ reraise=True,
446
+ )(self.amodel_call)
447
+ # Create tasks for each batch of request
448
+ tasks = [
449
+ asyncio.create_task(
450
+ retry_(
451
+ session=session,
452
+ messages=message,
453
+ cache_keys=cache_key,
454
+ generate=generate,
455
+ ctxlens=ctxlen,
456
+ **kwargs,
457
+ )
458
+ )
459
+ for message, cache_key, ctxlen in zip(
460
+ chunks(requests, n=self._batch_size),
461
+ chunks(cache_keys, n=self._batch_size),
462
+ chunks(ctxlens, n=self._batch_size),
463
+ )
464
+ ]
465
+
466
+ return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
467
+
468
+ def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
469
+ assert (
470
+ self.tokenizer is not None
471
+ ), "Tokenizer is required for loglikelihood tasks to compute context lengths."
472
+ res = []
473
+
474
+ def _collate(req: LogLikelihoodInputs):
475
+ """Defines the key for the sorted method"""
476
+ # the negative sign on len(toks) sorts descending - this has a few advantages:
477
+ # - time estimates will always be over not underestimates, which is more useful for planning
478
+ # - to know the size of a batch when going through the list, you know the first one is always the batch
479
+ # padded context length. this is useful to simplify the batching logic and more importantly to make
480
+ # automatic adaptive batches much much easier to implement
481
+ # - any OOMs will happen right away rather than near the end
482
+
483
+ toks = req[1] + req[2]
484
+ return -len(toks), tuple(toks)
485
+
486
+ re_ord = Collator(
487
+ requests,
488
+ sort_fn=_collate,
489
+ group_by=None,
490
+ )
491
+ # if concurrent then we'll batch in the async context
492
+ chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0)
493
+ if self._concurrent <= 1:
494
+ pbar = tqdm(desc="Requesting API", total=len(requests))
495
+ for chunk in chunked:
496
+ inputs, ctxlens, cache_keys = self.batch_logliklehood_requests([chunk])
497
+
498
+ outputs = retry(
499
+ stop=stop_after_attempt(self.max_retries),
500
+ wait=wait_exponential(multiplier=0.5, min=1, max=10),
501
+ reraise=True,
502
+ )(self.model_call)(messages=inputs, generate=False)
503
+ if isinstance(outputs, dict):
504
+ outputs = [outputs]
505
+ for answer_, cache_key in zip(
506
+ self.parse_logprobs(
507
+ outputs=outputs, tokens=inputs, ctxlens=ctxlens
508
+ ),
509
+ cache_keys,
510
+ ):
511
+ if answer_ is not None:
512
+ res.append(answer_)
513
+ # partial caching
514
+ if cache_key is not None:
515
+ self.cache_hook.add_partial(
516
+ "loglikelihood", cache_key, answer_
517
+ )
518
+ pbar.update(1)
519
+ else:
520
+ inputs, ctxlens, cache_keys = self.batch_logliklehood_requests(chunked)
521
+ res = itertools.chain.from_iterable(
522
+ asyncio.run(
523
+ self.get_batched_requests(
524
+ inputs, cache_keys, generate=False, ctxlens=ctxlens
525
+ )
526
+ )
527
+ )
528
+
529
+ return re_ord.get_original(res)
530
+
531
+ def generate_until(
532
+ self, requests: List[Instance], disable_tqdm: bool = False
533
+ ) -> List[str]:
534
+ res = []
535
+
536
+ def _collate_gen(_requests):
537
+ # sort by the length of the non-tokenized contexts
538
+ return -len(_requests[0])
539
+
540
+ # Let the API deal with tokenization
541
+ requests, all_gen_kwargs = zip(*(req.args for req in requests))
542
+ if self.tokenized_requests:
543
+ encodings_list = self.tok_encode(
544
+ requests, add_special_tokens=self.add_bos_token
545
+ )
546
+ else:
547
+ encodings_list = [None] * len(requests)
548
+ requests = [
549
+ (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list)
550
+ ]
551
+
552
+ re_ord = Collator(
553
+ requests,
554
+ sort_fn=_collate_gen,
555
+ group_by="gen_kwargs",
556
+ )
557
+ chunked = re_ord.get_batched(
558
+ n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None
559
+ )
560
+ if self._concurrent <= 1:
561
+ pbar = tqdm(desc="Requesting API", total=len(requests))
562
+ for chunk in chunked:
563
+ contexts, all_gen_kwargs, encodings_list = zip(*chunk)
564
+ req = encodings_list if self.tokenized_requests else contexts
565
+ outputs = retry(
566
+ stop=stop_after_attempt(self.max_retries),
567
+ wait=wait_exponential(multiplier=0.5, min=1, max=10),
568
+ reraise=True,
569
+ )(self.model_call)(
570
+ messages=req,
571
+ generate=True,
572
+ gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
573
+ )
574
+ for generated_text, context in zip(
575
+ self.parse_generations(
576
+ outputs=outputs,
577
+ contexts=contexts,
578
+ ),
579
+ contexts,
580
+ ):
581
+ if generated_text is not None:
582
+ res.append(generated_text)
583
+
584
+ # partial caching
585
+ if context is not None:
586
+ self.cache_hook.add_partial(
587
+ "generate_until",
588
+ (context, all_gen_kwargs[0]),
589
+ generated_text,
590
+ )
591
+ pbar.update(1)
592
+ else:
593
+ for chunk in chunked:
594
+ contexts, all_gen_kwargs, encodings_list = zip(*chunk)
595
+ req = encodings_list if self.tokenized_requests else contexts
596
+ results = itertools.chain.from_iterable(
597
+ asyncio.run(
598
+ self.get_batched_requests(
599
+ req,
600
+ cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts],
601
+ generate=True,
602
+ gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
603
+ )
604
+ )
605
+ )
606
+ res.extend(results)
607
+
608
+ return re_ord.get_original(res)
609
+
610
+ def loglikelihood_rolling(
611
+ self, requests: List[Instance], disable_tqdm: bool = False
612
+ ) -> List[float]:
613
+ loglikelihoods = []
614
+
615
+ for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
616
+ rolling_token_windows = list(
617
+ map(
618
+ utils.make_disjoint_window,
619
+ utils.get_rolling_token_windows(
620
+ token_list=self.tok_encode(string),
621
+ prefix_token=self.prefix_token_id,
622
+ max_seq_len=self.max_length,
623
+ context_len=1,
624
+ ),
625
+ )
626
+ )
627
+
628
+ # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
629
+ rolling_token_windows = [(None,) + x for x in rolling_token_windows]
630
+
631
+ string_nll = self._loglikelihood_tokens(
632
+ rolling_token_windows,
633
+ disable_tqdm=True,
634
+ )
635
+
636
+ # discard is_greedy
637
+ string_nll = [x[0] for x in string_nll]
638
+
639
+ string_nll = sum(string_nll)
640
+ loglikelihoods.append(string_nll)
641
+ return loglikelihoods
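The `_collate` helper above sorts loglikelihood requests by descending total token length so the largest, most memory-hungry batch runs first (any OOM surfaces immediately and time estimates stay conservative), and `Collator.get_original` later restores the caller's order. A minimal standalone sketch of that sort-then-restore pattern, on toy data only and not using the harness's `Collator` class:

# Illustrative sketch, not part of the uploaded file: toy whitespace "tokenization".
requests = ["a b c d e", "x", "p q r"]
order = sorted(range(len(requests)), key=lambda i: -len(requests[i].split()))
results = [f"processed:{requests[i]}" for i in order]  # stand-in for batched model calls
restored = [None] * len(requests)
for out_pos, orig_idx in enumerate(order):
    restored[orig_idx] = results[out_pos]
print(restored)  # outputs are back in the original request order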
scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py ADDED
@@ -0,0 +1,1356 @@
1
+ import copy
2
+ import os
3
+ from datetime import timedelta
4
+ from pathlib import Path
5
+ from typing import Dict, List, Literal, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import transformers
10
+ from accelerate import (
11
+ Accelerator,
12
+ InitProcessGroupKwargs,
13
+ find_executable_batch_size,
14
+ )
15
+ from accelerate.utils import get_max_memory
16
+ from huggingface_hub import HfApi
17
+ from packaging import version
18
+ from peft import PeftModel
19
+ from peft import __version__ as PEFT_VERSION
20
+ from tqdm import tqdm
21
+ from transformers.models.auto.modeling_auto import (
22
+ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
23
+ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
24
+ )
25
+
26
+ from lm_eval import utils
27
+ from lm_eval.api.instance import Instance
28
+ from lm_eval.api.model import TemplateLM
29
+ from lm_eval.api.registry import register_model
30
+ from lm_eval.models.utils import (
31
+ Collator,
32
+ clear_torch_cache,
33
+ configure_pad_token,
34
+ get_dtype,
35
+ pad_and_concat,
36
+ stop_sequences_criteria,
37
+ )
38
+
39
+
40
+ eval_logger = utils.eval_logger
41
+
42
+
43
+ @register_model("hf-auto", "hf", "huggingface")
44
+ class HFLM(TemplateLM):
45
+ """
46
+ An abstracted Huggingface model class. Enables usage with both models of
47
+ `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
48
+
49
+ Supports data-parallel multi-GPU with HF Accelerate.
50
+ """
51
+
52
+ AUTO_MODEL_CLASS = None
53
+ _DEFAULT_MAX_LENGTH = 2048
54
+
55
+ def __init__(
56
+ self,
57
+ pretrained: Union[str, transformers.PreTrainedModel],
58
+ backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
59
+ # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
60
+ revision: Optional[str] = "main",
61
+ subfolder: Optional[str] = None,
62
+ tokenizer: Optional[
63
+ Union[
64
+ str,
65
+ transformers.PreTrainedTokenizer,
66
+ transformers.PreTrainedTokenizerFast,
67
+ ]
68
+ ] = None,
69
+ truncation: Optional[bool] = False,
70
+ logits_cache: bool = True,
71
+ max_length: Optional[int] = None,
72
+ device: Optional[str] = "cuda",
73
+ dtype: Optional[Union[str, torch.dtype]] = "auto",
74
+ batch_size: Optional[Union[int, str]] = 1,
75
+ max_batch_size: Optional[int] = 64,
76
+ trust_remote_code: Optional[bool] = False,
77
+ use_fast_tokenizer: Optional[bool] = True,
78
+ add_bos_token: Optional[bool] = False,
79
+ prefix_token_id: Optional[int] = None,
80
+ # arguments used for splitting a model across GPUs naively.
81
+ # only used if `parallelize=True`.
82
+ parallelize: Optional[bool] = False,
83
+ max_memory_per_gpu: Optional[Union[int, str]] = None,
84
+ max_cpu_memory: Optional[Union[int, str]] = None,
85
+ offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
86
+ # PEFT, delta weights and quantization options
87
+ peft: Optional[str] = None,
88
+ delta: Optional[str] = None,
89
+ autogptq: Optional[Union[bool, str]] = False,
90
+ **kwargs,
91
+ ) -> None:
92
+ super().__init__()
93
+
94
+ # optionally: take in an already-initialized transformers.PreTrainedModel
95
+ if not isinstance(pretrained, str):
96
+ eval_logger.warning(
97
+ "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
98
+ )
99
+ assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
100
+ self._model = pretrained
101
+ self._device = self._model.device
102
+ self._config = self._model.config
103
+ gpus = 0
104
+
105
+ else:
106
+ assert isinstance(device, str)
107
+ assert isinstance(pretrained, str)
108
+ assert isinstance(batch_size, (int, str))
109
+
110
+ gpus = torch.cuda.device_count()
111
+ accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
112
+ accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
113
+ if accelerator.num_processes > 1:
114
+ self.accelerator = accelerator
115
+
116
+ if "npu" in accelerator.device.type:
117
+ gpus = torch.npu.device_count()
118
+
119
+ # using one process with no model parallelism
120
+ if not (parallelize or accelerator.num_processes > 1):
121
+ # use user-passed device
122
+ device_list = set(
123
+ ["cuda", "cpu"]
124
+ + [f"cuda:{i}" for i in range(gpus)]
125
+ + ["mps", "mps:0"]
126
+ + [f"npu:{i}" for i in range(gpus)]
127
+ )
128
+ if device and device in device_list:
129
+ self._device = torch.device(device)
130
+ eval_logger.info(f"Using device '{device}'")
131
+ if device in ("mps", "mps:0") and version.parse(
132
+ torch.__version__
133
+ ) < version.parse("2.1"):
134
+ raise RuntimeError(
135
+ f"mps requires torch >= 2.1. You have {torch.__version__}"
136
+ )
137
+ else:
138
+ eval_logger.info("Device not specified")
139
+ eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
140
+ self._device = (
141
+ torch.device("cuda")
142
+ if torch.cuda.is_available()
143
+ else torch.device("cpu")
144
+ )
145
+ else: # Parallelism managed by accelerate
146
+ if device != "cuda":
147
+ eval_logger.info(
148
+ f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
149
+ )
150
+ # TODO: include in warning that `load_in_8bit` etc. affect this too
151
+ self._device = (
152
+ self.accelerator.device
153
+ if hasattr(self, "accelerator")
154
+ else torch.device(device)
155
+ )
156
+
157
+ revision = str(revision) # cast to string if not already one
158
+ # TODO: update this to be less of a hack once subfolder is fixed in HF
159
+ revision = revision + ("/" + subfolder if subfolder is not None else "")
160
+
161
+ self._get_config(
162
+ pretrained,
163
+ revision=revision,
164
+ trust_remote_code=trust_remote_code,
165
+ )
166
+
167
+ # determine which of 'causal' and 'seq2seq' backends to use
168
+ self._get_backend(
169
+ config=self.config, backend=backend, trust_remote_code=trust_remote_code
170
+ )
171
+
172
+ # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
173
+ self._create_tokenizer(
174
+ pretrained,
175
+ tokenizer,
176
+ revision=revision,
177
+ trust_remote_code=trust_remote_code,
178
+ use_fast_tokenizer=use_fast_tokenizer,
179
+ )
180
+
181
+ # if we passed `pretrained` as a string, initialize our model now
182
+ if isinstance(pretrained, str):
183
+ self._create_model(
184
+ pretrained=pretrained,
185
+ revision=revision,
186
+ dtype=dtype,
187
+ trust_remote_code=trust_remote_code,
188
+ parallelize=parallelize,
189
+ gpus=gpus,
190
+ max_memory_per_gpu=max_memory_per_gpu,
191
+ max_cpu_memory=max_cpu_memory,
192
+ offload_folder=offload_folder,
193
+ peft=peft,
194
+ delta=delta,
195
+ autogptq=autogptq,
196
+ **kwargs,
197
+ )
198
+
199
+ # access self._model through self.model property outside this method
200
+ if isinstance(self.model, torch.nn.Module):
201
+ self.model.eval()
202
+ self.model.tie_weights()
203
+
204
+ self.truncation = truncation
205
+ self.logits_cache = logits_cache
206
+ self.vocab_size = self.tokenizer.vocab_size
207
+ # select (or create) a pad token to use
208
+ self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
209
+
210
+ self.add_bos_token = add_bos_token
211
+ if "gemma" in getattr(self.config, "model_type", ""):
212
+ self.add_bos_token = True
213
+ eval_logger.info(
214
+ f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
215
+ )
216
+
217
+ self._max_length = max_length
218
+ self.pretrained = pretrained
219
+ self.delta = delta
220
+ self.peft = peft
221
+ self.revision = revision
222
+ self.batch_schedule = 1
223
+ self.batch_sizes = {}
224
+ self.max_batch_size = max_batch_size
225
+
226
+ if str(batch_size).startswith("auto"):
227
+ batch_size = batch_size.split(":")
228
+ self.batch_size_per_gpu = batch_size[0]
229
+ self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
230
+ else:
231
+ self.batch_size_per_gpu = int(batch_size)
232
+
233
+ if isinstance(pretrained, str):
234
+ if gpus >= 1 or str(self.device) == "mps":
235
+ # TODO: can remove this whole snippet except in the mps case, perhaps?
236
+ if not (parallelize or autogptq or hasattr(self, "accelerator")):
237
+ # place model onto device requested manually,
238
+ # if not using HF Accelerate or device_map
239
+ # or any other option that preloads model onto device
240
+ try:
241
+ self.model.to(self.device)
242
+ except ValueError:
243
+ eval_logger.debug(
244
+ "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
245
+ )
246
+ # multigpu data-parallel support when launched with accelerate
247
+ if gpus > 1:
248
+ if accelerator.num_processes > 1:
249
+ if parallelize:
250
+ eval_logger.warning(
251
+ "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
252
+ )
253
+ elif gpus > accelerator.num_processes:
254
+ eval_logger.warning(
255
+ "WARNING: The number of total system GPUs does not match the number of spawned processes. "
256
+ "If you would like to use data parallelism, please launch the script "
257
+ "with 'accelerate launch *script*'. "
258
+ f"Current run will proceed with {accelerator.num_processes} devices."
259
+ )
260
+ if self.accelerator.is_local_main_process:
261
+ eval_logger.info(
262
+ f"Using {gpus} devices with data parallelism"
263
+ )
264
+
265
+ self._device = torch.device(f"{accelerator.device}")
266
+ self.accelerator = accelerator
267
+
268
+ self._rank = self.accelerator.local_process_index
269
+ self._world_size = self.accelerator.num_processes
270
+ else:
271
+ # if we aren't launching via accelerate, ditch
272
+ self._rank = 0
273
+ self._world_size = 1
274
+ else:
275
+ # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
276
+ eval_logger.warning(
277
+ "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
278
+ )
279
+ self._rank = 0
280
+ self._world_size = 1
281
+
282
+ self.custom_prefix_token_id = prefix_token_id
283
+ if prefix_token_id is not None:
284
+ eval_logger.info(
285
+ f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
286
+ )
287
+
288
+ def _get_accelerate_args(
289
+ self,
290
+ parallelize: bool = None,
291
+ device_map: Optional[str] = "auto",
292
+ max_memory_per_gpu: Optional[Union[int, str]] = None,
293
+ max_cpu_memory: Optional[Union[int, str]] = None,
294
+ offload_folder: Optional[str] = "./offload",
295
+ gpus: Optional[int] = None,
296
+ ) -> dict:
297
+ """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
298
+ num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
299
+ num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
300
+ if (
301
+ num_machines == 0
302
+ and hasattr(self, "accelerator")
303
+ and self.accelerator is not None
304
+ ):
305
+ eval_logger.info(
306
+ "We are not in a distributed setting for accelerate. Setting model_parallel to False."
307
+ )
308
+ parallelize = False
309
+
310
+ if parallelize is None:
311
+ # If parallelism is unset by the user, we automatically assign model parallelism
312
+ # if enough extra GPUs are available
313
+ max_memory_all_gpus = get_max_memory()
314
+ # We just want gpu, not cpu, max memory
315
+ if "cpu" in max_memory_all_gpus:
316
+ del max_memory_all_gpus["cpu"]
317
+ parallelize = bool(num_local_processes < len(max_memory_all_gpus))
318
+ eval_logger.info(
319
+ f"Setting model parallel to {parallelize} since "
320
+ f"the number of local processes is {num_local_processes} "
321
+ f"and the number of GPUs is {len(max_memory_all_gpus)}"
322
+ )
323
+
324
+ args = {}
325
+ if parallelize: # Model parallelism will be used
326
+ max_memory = {}
327
+ if max_memory_per_gpu is not None: # Using the provided memory requirements
328
+ max_memory_per_gpu_map = {
329
+ device_idx: max_memory_per_gpu for device_idx in range(gpus)
330
+ }
331
+ else: # Estimating the possible memory requirements
332
+ max_memory_all_gpus = get_max_memory()
333
+ if "cpu" in max_memory_all_gpus:
334
+ del max_memory_all_gpus["cpu"]
335
+ if not hasattr(self, "accelerator"):
336
+ max_memory_per_gpu_map = {
337
+ k: v for k, v in max_memory_all_gpus.items()
338
+ }
339
+ else:
340
+ # use only 1 / num_processes of the GPUs if we are running under accelerate launch
341
+ max_memory_per_gpu_map = {
342
+ k: v
343
+ for k, v in max_memory_all_gpus.items()
344
+ if k % num_local_processes
345
+ == (self.accelerator.process_index % num_local_processes)
346
+ }
347
+ args["max_memory"] = max_memory_per_gpu_map
348
+ args["device_map"] = "auto"
349
+ eval_logger.info(
350
+ f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to 'auto'"
351
+ )
352
+
353
+ if max_cpu_memory is not None:
354
+ max_memory["cpu"] = max_cpu_memory
355
+
356
+ args["offload_folder"] = offload_folder
357
+ elif (
358
+ device_map is None
359
+ ): # No model parallelism, we use the default provided device for our model
360
+ if hasattr(self, "accelerator"):
361
+ device_map = {"": f"{self.accelerator.device}"}
362
+ else:
363
+ device_map = {"": str(self.device)}
364
+ args["max_memory"] = None
365
+ args["device_map"] = device_map
366
+ eval_logger.info(
367
+ f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
368
+ )
369
+ else:
370
+ args["max_memory"] = None
371
+ args["device_map"] = None
372
+ eval_logger.info("Model parallel was set to False.")
373
+
374
+ return args
375
+
376
+ @property
377
+ def config(self):
378
+ # return the associated transformers.AutoConfig for the given pretrained model.
379
+ return self._config
380
+
381
+ @property
382
+ def model(self):
383
+ # returns the model, unwrapping it if using Accelerate
384
+ if hasattr(self, "accelerator"):
385
+ return self.accelerator.unwrap_model(self._model)
386
+ else:
387
+ return self._model
388
+
389
+ @property
390
+ def eot_token_id(self):
391
+ # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
392
+ return self.tokenizer.eos_token_id
393
+
394
+ @property
395
+ def prefix_token_id(self):
396
+ # it is used as prefix for loglikelihood
397
+ if self.custom_prefix_token_id is not None:
398
+ return self.custom_prefix_token_id
399
+ if self.tokenizer.bos_token_id is not None:
400
+ return self.tokenizer.bos_token_id
401
+ return self.tokenizer.eos_token_id
402
+
403
+ @property
404
+ def max_length(self):
405
+ if self._max_length: # if max length manually set, return it
406
+ return self._max_length
407
+ seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
408
+ for attr in seqlen_config_attrs:
409
+ if hasattr(self.model.config, attr):
410
+ return getattr(self.model.config, attr)
411
+ if hasattr(self.tokenizer, "model_max_length"):
412
+ if self.tokenizer.model_max_length == 1000000000000000019884624838656:
413
+ return self._DEFAULT_MAX_LENGTH
414
+ return self.tokenizer.model_max_length
415
+ return self._DEFAULT_MAX_LENGTH
416
+
417
+ @property
418
+ def max_gen_toks(self) -> int:
419
+ return 256
420
+
421
+ @property
422
+ def batch_size(self):
423
+ return self.batch_size_per_gpu
424
+
425
+ @property
426
+ def device(self):
427
+ return self._device
428
+
429
+ @property
430
+ def rank(self):
431
+ return self._rank
432
+
433
+ @property
434
+ def world_size(self):
435
+ return self._world_size
436
+
437
+ @property
438
+ def tokenizer_name(self) -> str:
439
+ return self.tokenizer.name_or_path.replace("/", "__")
440
+
441
+ @property
442
+ def chat_template(self) -> str:
443
+ if self.tokenizer.chat_template is not None:
444
+ return self.tokenizer.chat_template
445
+ return self.tokenizer.default_chat_template
446
+
447
+ def _get_backend(
448
+ self,
449
+ config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
450
+ backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
451
+ trust_remote_code: Optional[bool] = False,
452
+ ) -> None:
453
+ """
454
+ Helper method during initialization.
455
+ Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
456
+ model type to be used.
457
+ """
458
+ assert backend in ["default", "causal", "seq2seq"]
459
+
460
+ if backend != "default":
461
+ # if we've settled on non-default backend, use that manually
462
+ if backend == "causal":
463
+ self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
464
+ elif backend == "seq2seq":
465
+ self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
466
+ eval_logger.info(
467
+ f"Overrode HF model backend type, and using type '{backend}'"
468
+ )
469
+ else:
470
+ # determine and use the default HF backend for this model, based on its config + metadata.
471
+ if (
472
+ getattr(config, "model_type")
473
+ in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
474
+ ):
475
+ # first check if model type is listed under seq2seq models, since some
476
+ # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
477
+ # these special cases should be treated as seq2seq models.
478
+ self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
479
+ elif (
480
+ getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
481
+ ):
482
+ self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
483
+ else:
484
+ if not trust_remote_code:
485
+ eval_logger.warning(
486
+ "HF model type is neither marked as CausalLM or Seq2SeqLM. \
487
+ This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
488
+ )
489
+ # if model type is neither in HF transformers causal or seq2seq model registries
490
+ # then we default to AutoModelForCausalLM
491
+ self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
492
+
493
+ assert self.AUTO_MODEL_CLASS in [
494
+ transformers.AutoModelForCausalLM,
495
+ transformers.AutoModelForSeq2SeqLM,
496
+ ]
497
+ return None
498
+
499
+ def _get_config(
500
+ self,
501
+ pretrained: str,
502
+ revision: str = "main",
503
+ trust_remote_code: bool = False,
504
+ ) -> None:
505
+ self._config = transformers.AutoConfig.from_pretrained(
506
+ pretrained,
507
+ revision=revision,
508
+ trust_remote_code=trust_remote_code,
509
+ )
510
+
511
+ def _create_model(
512
+ self,
513
+ pretrained: str,
514
+ revision: Optional[str] = "main",
515
+ dtype: Optional[Union[str, torch.dtype]] = "auto",
516
+ trust_remote_code: Optional[bool] = False,
517
+ # arguments used for splitting a model across GPUs naively.
518
+ # only used if `parallelize=True`.
519
+ # (accelerate naive PP (device_map) options)
520
+ parallelize: Optional[bool] = False,
521
+ gpus: Optional[int] = None,
522
+ max_memory_per_gpu: Optional[Union[int, str]] = None,
523
+ max_cpu_memory: Optional[Union[int, str]] = None,
524
+ offload_folder: Optional[str] = "./offload",
525
+ # PEFT, delta weights and quantization options
526
+ peft: Optional[str] = None,
527
+ delta: Optional[str] = None,
528
+ autogptq: Optional[Union[bool, str]] = False,
529
+ **kwargs,
530
+ ) -> None:
531
+ """
532
+ Initializes an HF or HF-compatible PreTrainedModel from scratch
533
+ inside HFLM, using the kwargs passed into self.__init__().
534
+
535
+ Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
536
+
537
+ For future similar extensions to AutoGPTQ that are not core to HF's ecosystem,
538
+ (such as PyTorch models that are nearly, but not quite, fully mirroring
539
+ HF's public interface relied on in this HFLM class)
540
+ please consider subclassing HFLM and overriding this and other methods as needed.
541
+ """
542
+
543
+ model_kwargs = kwargs if kwargs else {}
544
+
545
+ model_kwargs.update(
546
+ self._get_accelerate_args(
547
+ parallelize=parallelize,
548
+ device_map=kwargs.get("device_map", None),
549
+ max_memory_per_gpu=max_memory_per_gpu,
550
+ max_cpu_memory=max_cpu_memory,
551
+ offload_folder=offload_folder,
552
+ gpus=gpus,
553
+ )
554
+ )
555
+
556
+ if not autogptq:
557
+ if model_kwargs.get("load_in_4bit", None):
558
+ assert (
559
+ transformers.__version__ >= "4.30.0"
560
+ ), "load_in_4bit requires transformers >= 4.30.0"
561
+ if transformers.__version__ >= "4.30.0":
562
+ if model_kwargs.get("load_in_4bit", None):
563
+ if model_kwargs.get("bnb_4bit_compute_dtype", None):
564
+ model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
565
+ model_kwargs["bnb_4bit_compute_dtype"]
566
+ )
567
+
568
+ self._model = self.AUTO_MODEL_CLASS.from_pretrained(
569
+ pretrained,
570
+ revision=revision,
571
+ torch_dtype=get_dtype(dtype),
572
+ trust_remote_code=trust_remote_code,
573
+ **model_kwargs,
574
+ )
575
+ else:
576
+ try:
577
+ from auto_gptq import AutoGPTQForCausalLM
578
+ except ModuleNotFoundError:
579
+ raise Exception(
580
+ "Tried to load auto_gptq, but auto-gptq is not installed ",
581
+ "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
582
+ )
583
+
584
+ self._model = AutoGPTQForCausalLM.from_quantized(
585
+ pretrained,
586
+ trust_remote_code=trust_remote_code,
587
+ model_basename=None if autogptq is True else Path(autogptq).stem,
588
+ use_safetensors=True
589
+ if autogptq is True
590
+ else autogptq.endswith(".safetensors"),
591
+ **model_kwargs,
592
+ )
593
+
594
+ if peft and delta:
595
+ raise ValueError(
596
+ "Cannot use both 'peft' and 'delta' options at the same time."
597
+ )
598
+
599
+ if peft:
600
+ if model_kwargs.get("load_in_4bit", None):
601
+ if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
602
+ raise AssertionError("load_in_4bit requires peft >= 0.4.0")
603
+ if self._model.config.vocab_size != len(self.tokenizer):
604
+ # resize model for LoRAs with added tokens
605
+ self._model.resize_token_embeddings(len(self.tokenizer))
606
+ eval_logger.info(
607
+ f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
608
+ )
609
+ self._model = PeftModel.from_pretrained(
610
+ self._model, peft, revision=revision
611
+ )
612
+ elif delta:
613
+ if autogptq:
614
+ eval_logger.warning(
615
+ "Delta weights might trigger unexpected behavior when used with AutoGPTQ."
616
+ )
617
+ _model_delta = self.AUTO_MODEL_CLASS.from_pretrained(
618
+ delta,
619
+ revision=revision,
620
+ torch_dtype=get_dtype(dtype),
621
+ trust_remote_code=trust_remote_code,
622
+ **model_kwargs,
623
+ )
624
+ for name, param in self._model.state_dict().items():
625
+ try:
626
+ param.data += _model_delta.state_dict()[name]
627
+ except KeyError:
628
+ raise KeyError(f"Delta model is missing weights for layer: {name}")
629
+ except Exception as e:
630
+ raise RuntimeError(
631
+ f"Failed to add delta weights to layer {name}. Error: {e}"
632
+ )
633
+
634
+ del _model_delta
635
+
636
+ return None
637
+
638
+ def _create_tokenizer(
639
+ self,
640
+ pretrained: Union[str, transformers.PreTrainedModel],
641
+ tokenizer: Optional[
642
+ Union[
643
+ str,
644
+ transformers.PreTrainedTokenizer,
645
+ transformers.PreTrainedTokenizerFast,
646
+ ]
647
+ ],
648
+ revision: Optional[str] = "main",
649
+ trust_remote_code: Optional[bool] = False,
650
+ use_fast_tokenizer: Optional[bool] = True,
651
+ ) -> None:
652
+ """
653
+ Helper method during initialization.
654
+
655
+ Create a tokenizer object corresponding to the correct
656
+ tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
657
+ """
658
+
659
+ if tokenizer:
660
+ if isinstance(tokenizer, str):
661
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
662
+ tokenizer,
663
+ revision=revision,
664
+ trust_remote_code=trust_remote_code,
665
+ use_fast=use_fast_tokenizer,
666
+ )
667
+ else:
668
+ assert isinstance(
669
+ tokenizer, transformers.PreTrainedTokenizer
670
+ ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
671
+ self.tokenizer = tokenizer
672
+ else:
673
+ # Get tokenizer based on 'pretrained'
674
+ if isinstance(pretrained, str):
675
+ model_name = pretrained
676
+ else:
677
+ # get the HF hub name via accessor on model
678
+ model_name = self.model.name_or_path
679
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
680
+ model_name,
681
+ revision=revision,
682
+ trust_remote_code=trust_remote_code,
683
+ use_fast=use_fast_tokenizer,
684
+ )
685
+ return None
686
+
687
+ def _detect_batch_size(self, requests=None, pos: int = 0):
688
+ if requests:
689
+ _, context_enc, continuation_enc = requests[pos]
690
+ max_length = len(
691
+ (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
692
+ )
693
+ max_context_enc = len(context_enc[-(self.max_length + 1) :])
694
+ max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
695
+ else:
696
+ max_length = self.max_length
697
+ max_context_enc = max_length
698
+ max_cont_enc = max_length
699
+
700
+ # if OOM, then halves batch_size and tries again
701
+ @find_executable_batch_size(starting_batch_size=self.max_batch_size)
702
+ def forward_batch(batch_size):
703
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
704
+ length = max(max_context_enc, max_cont_enc)
705
+ batched_conts = torch.ones(
706
+ (batch_size, length), device=self.device
707
+ ).long()
708
+ test_batch = torch.ones((batch_size, length), device=self.device).long()
709
+ call_kwargs = {
710
+ "attn_mask": test_batch,
711
+ "labels": batched_conts,
712
+ }
713
+ else:
714
+ call_kwargs = {}
715
+ test_batch = torch.ones(
716
+ (batch_size, max_length), device=self.device
717
+ ).long()
718
+ for _ in range(5):
719
+ out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) # noqa: F841
720
+
721
+ return batch_size
722
+
723
+ try:
724
+ batch_size = forward_batch()
725
+ except RuntimeError as e:
726
+ if "No executable batch size found" in str(e):
727
+ batch_size = 1
728
+ else:
729
+ raise
730
+
731
+ if self.world_size > 1:
732
+ # if multi-GPU, always take minimum over all selected batch sizes
733
+ max_rnk_bs = torch.tensor([batch_size], device=self.device)
734
+ gathered = (
735
+ self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
736
+ )
737
+ batch_size = min(gathered)
738
+ clear_torch_cache()
739
+ return batch_size
740
+
741
+ clear_torch_cache()
742
+ return batch_size
743
+
744
+ def tok_encode(
745
+ self, string: str, left_truncate_len=None, add_special_tokens=None
746
+ ) -> List[int]:
747
+ """ """
748
+ # default for None - empty dict, use predefined tokenizer param
749
+ # used for all models except for CausalLM or predefined value
750
+ special_tokens_kwargs = {}
751
+
752
+ # by default for CausalLM - false or self.add_bos_token is set
753
+ if add_special_tokens is None:
754
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
755
+ special_tokens_kwargs = {
756
+ "add_special_tokens": False or self.add_bos_token
757
+ }
758
+ # otherwise the method explicitly defines the value
759
+ else:
760
+ special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
761
+
762
+ encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
763
+
764
+ # left-truncate the encoded context to be at most `left_truncate_len` tokens long
765
+ if left_truncate_len:
766
+ encoding = encoding[-left_truncate_len:]
767
+
768
+ return encoding
769
+
770
+ def tok_batch_encode(
771
+ self,
772
+ strings: List[str],
773
+ padding_side: str = "left",
774
+ left_truncate_len: int = None,
775
+ truncation: bool = False,
776
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
777
+ # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
778
+ old_padding_side = self.tokenizer.padding_side
779
+ self.tokenizer.padding_side = padding_side
780
+
781
+ add_special_tokens = {}
782
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
783
+ add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
784
+
785
+ encoding = self.tokenizer(
786
+ strings,
787
+ truncation=truncation,
788
+ padding="longest",
789
+ return_tensors="pt",
790
+ **add_special_tokens,
791
+ )
792
+ if left_truncate_len:
793
+ encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
794
+ encoding["attention_mask"] = encoding["attention_mask"][
795
+ :, -left_truncate_len:
796
+ ]
797
+ self.tokenizer.padding_side = old_padding_side
798
+
799
+ return encoding["input_ids"], encoding["attention_mask"]
800
+
801
+ def tok_decode(self, tokens, skip_special_tokens=True):
802
+ return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
803
+
804
+ def _model_call(self, inps, attn_mask=None, labels=None):
805
+ """
806
+ :param inps: torch.Tensor
807
+ A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
808
+ [batch, sequence_ctx]. the size of sequence may vary from call to call
809
+ :param attn_mask: torch.Tensor, optional
810
+ A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
811
+ (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
812
+ :param labels: torch.Tensor, optional
813
+ A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
814
+ (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
815
+ :return
816
+ A torch tensor of shape [batch, sequence, vocab] with the
817
+ logits returned from the model's decoder
818
+ """
819
+ with torch.no_grad():
820
+ if attn_mask is not None or labels is not None:
821
+ assert attn_mask is not None and labels is not None
822
+ assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
823
+ return self.model(
824
+ input_ids=inps, attention_mask=attn_mask, labels=labels
825
+ ).logits
826
+ else:
827
+ assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
828
+ return self.model(inps).logits
829
+
830
+ def _model_generate(self, context, max_length, stop, **generation_kwargs):
831
+ # temperature = 0.0 if not set
832
+ # if do_sample is false and temp==0.0:
833
+ # remove temperature, as do_sample=False takes care of this
834
+ # and we don't want a warning from HF
835
+ generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
836
+ do_sample = generation_kwargs.get("do_sample", None)
837
+
838
+ # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
839
+ if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
840
+ generation_kwargs["do_sample"] = do_sample = False
841
+
842
+ if do_sample is False and generation_kwargs.get("temperature") == 0.0:
843
+ generation_kwargs.pop("temperature")
844
+ # build stopping criteria
845
+ stopping_criteria = stop_sequences_criteria(
846
+ self.tokenizer, stop, context.shape[1], context.shape[0]
847
+ )
848
+ return self.model.generate(
849
+ input_ids=context,
850
+ max_length=max_length,
851
+ stopping_criteria=stopping_criteria,
852
+ pad_token_id=self.tokenizer.pad_token_id,
853
+ use_cache=True,
854
+ **generation_kwargs,
855
+ )
856
+
857
+ def _select_cont_toks(
858
+ self, logits: torch.Tensor, contlen: int = None, inplen: int = None
859
+ ) -> torch.Tensor:
860
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
861
+ assert (
862
+ contlen and inplen
863
+ ), "Must pass input len and cont. len to select scored logits for causal LM"
864
+ # discard right-padding.
865
+ # also discard the input/context tokens. we'll only score continuations.
866
+ logits = logits[inplen - contlen : inplen]
867
+ elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
868
+ assert (
869
+ contlen and not inplen
870
+ ), "Selecting scored logits for Seq2SeqLM requires only cont. len"
871
+ # only discard right-padding.
872
+ # the logits input to this fn only contain decoder-side tokens.
873
+ logits = logits[:contlen]
874
+
875
+ return logits
876
+
877
+ def loglikelihood_rolling(
878
+ self, requests: List[Instance], disable_tqdm: bool = False
879
+ ) -> List[float]:
880
+ loglikelihoods = []
881
+
882
+ adaptive_batch_size = None
883
+ if self.batch_size == "auto":
884
+ # using rolling window with maximum context
885
+ print("Passed argument batch_size = auto. Detecting largest batch size")
886
+ batch_size = self._detect_batch_size()
887
+ print(f"Determined Largest batch size: {batch_size}")
888
+ adaptive_batch_size = batch_size
889
+
890
+ for (string,) in tqdm(
891
+ [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
892
+ ):
893
+ rolling_token_windows = list(
894
+ map(
895
+ utils.make_disjoint_window,
896
+ utils.get_rolling_token_windows(
897
+ token_list=self.tok_encode(string),
898
+ prefix_token=self.prefix_token_id,
899
+ max_seq_len=self.max_length,
900
+ context_len=1,
901
+ ),
902
+ )
903
+ )
904
+
905
+ # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
906
+ rolling_token_windows = [(None,) + x for x in rolling_token_windows]
907
+
908
+ pad_amnt = 0
909
+ if self.world_size > 1:
910
+ # We pad out the external document-level iterator so the inner iterator doesn't hang
911
+ mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
912
+ gathered = (
913
+ self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
914
+ )
915
+
916
+ pad_amnt = max(gathered) - gathered[self.rank]
917
+ if pad_amnt > 0:
918
+ rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
919
+
920
+ string_nll = self._loglikelihood_tokens(
921
+ requests=rolling_token_windows,
922
+ disable_tqdm=True,
923
+ override_bs=adaptive_batch_size,
924
+ )
925
+
926
+ if (self.world_size > 1) and (pad_amnt > 0):
927
+ string_nll = [x[0] for x in string_nll[:-pad_amnt]]
928
+ else:
929
+ # discard is_greedy
930
+ string_nll = [x[0] for x in string_nll]
931
+
932
+ string_nll = sum(string_nll)
933
+ loglikelihoods.append(string_nll)
934
+
935
+ return loglikelihoods
936
+
937
+ def _batch_scheduler(self, pos, n_reordered_requests):
938
+ sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
939
+ if sched in self.batch_sizes:
940
+ return self.batch_sizes[sched]
941
+ if (len(self.batch_sizes) > 1) and (
942
+ self.batch_sizes[sched - 1] == self.max_batch_size
943
+ ):
944
+ # if previous batch size is already maximal, skip recomputation
945
+ self.batch_sizes[sched] = self.max_batch_size
946
+ return self.batch_sizes[sched]
947
+ print(
948
+ f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
949
+ )
950
+ self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
951
+ print(f"Determined largest batch size: {self.batch_sizes[sched]}")
952
+ return self.batch_sizes[sched]
953
+
954
+ def _loglikelihood_tokens(
955
+ self,
956
+ requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
957
+ disable_tqdm: bool = False,
958
+ override_bs: int = None,
959
+ ) -> List[Tuple[float, bool]]:
960
+ # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
961
+ res = []
962
+
963
+ def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
964
+ """Defines the key for the sorted method"""
965
+ # the negative sign on len(toks) sorts descending - this has a few advantages:
966
+ # - time estimates will always be over not underestimates, which is more useful for planning
967
+ # - to know the size of a batch when going through the list, you know the first one is always the batch
968
+ # padded context length. this is useful to simplify the batching logic and more importantly to make
969
+ # automatic adaptive batches much much easier to implement
970
+ # - any OOMs will happen right away rather than near the end
971
+
972
+ toks = req[1] + req[2]
973
+ return -len(toks), tuple(toks)
974
+
975
+ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
976
+ """Defines the key to group and lookup one-token continuations"""
977
+ # Use with group_by="contexts" (optional)
978
+ # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
979
+ # speeds up some multiple-choice tasks proportionally to the number of choices.
980
+ # groups requests by context+continuation[:-1] and infer on one request/group.
981
+ return req[-2] + req[-1][:-1]
982
+
983
+ re_ord = Collator(
984
+ requests,
985
+ sort_fn=_collate,
986
+ group_by="contexts"
987
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
988
+ and self.logits_cache
989
+ else None,
990
+ group_fn=_lookup_one_token_cont,
991
+ )
992
+
993
+ # automatic (variable) batch size detection for vectorization
994
+ # pull longest context sample from request
995
+ n_reordered_requests = len(re_ord)
996
+ batch_size = (
997
+ self.batch_size
998
+ if self.batch_size != "auto"
999
+ else override_bs
1000
+ if override_bs is not None
1001
+ else 0
1002
+ )
1003
+ batch_fn = (
1004
+ self._batch_scheduler
1005
+ if self.batch_size == "auto"
1006
+ and n_reordered_requests > 0
1007
+ and not override_bs
1008
+ else None
1009
+ )
1010
+
1011
+ chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
1012
+ pbar = tqdm(
1013
+ total=len(requests),
1014
+ disable=(disable_tqdm or (self.rank != 0)),
1015
+ desc="Running loglikelihood requests",
1016
+ )
1017
+ for chunk in chunks:
1018
+ inps = []
1019
+ cont_toks_list = []
1020
+ inplens = []
1021
+
1022
+ conts = []
1023
+ encoder_attns = []
1024
+
1025
+ padding_len_inp = None
1026
+ padding_len_cont = None
1027
+ # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
1028
+ # tensors, then we pack them together into a batch, call the model, and then pick it all apart
1029
+ # again because vectorizing is annoying
1030
+
1031
+ for _, context_enc, continuation_enc in chunk:
1032
+ # sanity check
1033
+ assert len(context_enc) > 0
1034
+ assert len(continuation_enc) > 0
1035
+ assert len(continuation_enc) <= self.max_length
1036
+
1037
+ # how this all works (illustrated on a causal decoder-only setup):
1038
+ # CTX CONT
1039
+ # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
1040
+ # model \ \
1041
+ # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
1042
+ # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
1043
+
1044
+ # when too long to fit in context, truncate from the left
1045
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
1046
+ inp = torch.tensor(
1047
+ (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
1048
+ dtype=torch.long,
1049
+ device=self.device,
1050
+ )
1051
+ (inplen,) = inp.shape
1052
+ elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
1053
+ inp = torch.tensor(
1054
+ (context_enc)[-self.max_length :],
1055
+ dtype=torch.long,
1056
+ device=self.device,
1057
+ )
1058
+ (inplen,) = inp.shape
1059
+
1060
+ # build encoder attn masks
1061
+ encoder_attns.append(torch.ones_like(inp))
1062
+
1063
+ cont = torch.tensor(
1064
+ (continuation_enc)[-self.max_length :],
1065
+ # TODO: left-shift these?
1066
+ # TODO: our code assumes we never end up truncating conts for either model type
1067
+ dtype=torch.long,
1068
+ device=self.device,
1069
+ )
1070
+ (contlen,) = cont.shape
1071
+
1072
+ conts.append(cont)
1073
+
1074
+ padding_len_cont = (
1075
+ max(padding_len_cont, contlen)
1076
+ if padding_len_cont is not None
1077
+ else contlen
1078
+ )
1079
+
1080
+ padding_len_inp = (
1081
+ max(padding_len_inp, inplen)
1082
+ if padding_len_inp is not None
1083
+ else inplen
1084
+ )
1085
+
1086
+ inps.append(inp) # [1, inp_length]
1087
+ cont_toks_list.append(continuation_enc)
1088
+ inplens.append(inplen)
1089
+
1090
+ # create encoder attn mask and batched conts, if seq2seq
1091
+ call_kwargs = {}
1092
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
1093
+ batched_inps = pad_and_concat(
1094
+ padding_len_inp, inps, padding_side="right"
1095
+ ) # [batch, padding_len_inp]
1096
+ elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
1097
+ # TODO: left-pad encoder inps and mask?
1098
+ batched_inps = pad_and_concat(
1099
+ padding_len_inp, inps
1100
+ ) # [batch, padding_len_inp]
1101
+ batched_conts = pad_and_concat(
1102
+ padding_len_cont, conts
1103
+ ) # [batch, padding_len_cont]
1104
+ batched_encoder_mask = pad_and_concat(
1105
+ padding_len_inp, encoder_attns
1106
+ ) # [batch, padding_len_inp]
1107
+ call_kwargs = {
1108
+ "attn_mask": batched_encoder_mask,
1109
+ "labels": batched_conts,
1110
+ }
1111
+
1112
+ multi_logits = F.log_softmax(
1113
+ self._model_call(batched_inps, **call_kwargs), dim=-1
1114
+ ) # [batch, padding_length (inp or cont), vocab]
1115
+
1116
+ for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
1117
+ chunk, multi_logits, inplens, cont_toks_list
1118
+ ):
1119
+ # Slice to original seq length
1120
+ contlen = len(cont_toks)
1121
+ # take only logits in the continuation
1122
+ # (discard context toks if decoder-only ; discard right-padding)
1123
+ # also discards + checks for "virtual tokens" in the causal LM's input window
1124
+ # from prompt/prefix tuning tokens, if applicable
1125
+ ctx_len = (
1126
+ inplen + (logits.shape[0] - padding_len_inp)
1127
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
1128
+ else None
1129
+ )
1130
+ logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
1131
+ logits = logits.unsqueeze(0) # [1, seq, vocab]
1132
+
1133
+ # Check if per-token argmax is exactly equal to continuation
1134
+ greedy_tokens = logits.argmax(dim=-1)
1135
+
1136
+ # check for one-token continuation cache hits.
1137
+ # noop in case group_by != "contexts" or no cache hit and returns the
1138
+ # original args. Otherwise, expands the logits batch dimension and yields each
1139
+ # batch along with matching continuation tokens and prompt strings.
1140
+ # logits -> [1, seq, vocab]
1141
+ for request_str, cont_toks, logits in re_ord.get_cache(
1142
+ req_str=request_str,
1143
+ cxt_toks=ctx_tokens,
1144
+ cont_toks=cont_toks,
1145
+ logits=logits,
1146
+ ):
1147
+ cont_toks = torch.tensor(
1148
+ cont_toks, dtype=torch.long, device=self.device
1149
+ ).unsqueeze(0) # [1, seq]
1150
+ max_equal = (greedy_tokens == cont_toks).all()
1151
+
1152
+ # Obtain log-probs at the corresponding continuation token indices
1153
+ # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
1154
+ logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
1155
+ -1
1156
+ ) # [1, seq]
1157
+
1158
+ # Answer: (log prob, is-exact-match)
1159
+ answer = (float(logits.sum()), bool(max_equal))
1160
+
1161
+ res.append(answer)
1162
+
1163
+ self.cache_hook.add_partial("loglikelihood", request_str, answer)
1164
+ pbar.update(1)
1165
+
1166
+ pbar.close()
1167
+
1168
+ return re_ord.get_original(res)
1169
+
1170
+ def generate_until(
1171
+ self, requests: List[Instance], disable_tqdm: bool = False
1172
+ ) -> List[str]:
1173
+ res = []
1174
+
1175
+ def _collate(req: Tuple[str, dict]):
1176
+ """Defines the key for the sorted method"""
1177
+ # the negative sign on len(toks) sorts descending - this has a few advantages:
1178
+ # - time estimates will always be over not underestimates, which is more useful for planning
1179
+ # - to know the size of a batch when going through the list, you know the first one is always the batch
1180
+ # padded context length. this is useful to simplify the batching logic and more importantly to make
1181
+ # automatic adaptive batches much much easier to implement
1182
+ # - any OOMs will happen right away rather than near the end
1183
+ toks = self.tok_encode(req[0])
1184
+ return -len(toks), req[0]
1185
+
1186
+ pbar = tqdm(
1187
+ total=len(requests),
1188
+ disable=(disable_tqdm or (self.rank != 0)),
1189
+ desc="Running generate_until requests",
1190
+ )
1191
+ adaptive_batch_size = None
1192
+ if self.batch_size == "auto":
1193
+ # using rolling window with maximum context
1194
+ print("Passed argument batch_size = auto. Detecting largest batch size")
1195
+ batch_size = self._detect_batch_size()
1196
+ print(f"Determined Largest batch size: {batch_size}")
1197
+ adaptive_batch_size = batch_size
1198
+ # for each different set of kwargs, we execute all requests, by batch.
1199
+ batch_size = (
1200
+ self.batch_size
1201
+ if self.batch_size != "auto"
1202
+ else adaptive_batch_size
1203
+ if adaptive_batch_size is not None
1204
+ else 0
1205
+ )
1206
+ batch_fn = (
1207
+ self._batch_scheduler
1208
+ if self.batch_size == "auto" and not adaptive_batch_size
1209
+ else None
1210
+ )
1211
+
1212
+ # we group requests by their generation_kwargs,
1213
+ # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
1214
+ # in the same batch.
1215
+ # group_fn=lambda x: x[1] -> x=(context, gen_kwargs)
1216
+ re_ords = Collator(
1217
+ [reg.args for reg in requests],
1218
+ sort_fn=_collate,
1219
+ group_by="gen_kwargs",
1220
+ group_fn=lambda x: x[1],
1221
+ )
1222
+ chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
1223
+ for chunk in chunks:
1224
+ contexts, all_gen_kwargs = zip(*chunk)
1225
+ # we assume all gen kwargs in the batch are the same
1226
+ # this is safe to assume because the `grouper` object ensures it.
1227
+ gen_kwargs = all_gen_kwargs[0]
1228
+ # unpack our keyword arguments.
1229
+ until = None
1230
+ if isinstance(gen_kwargs, dict):
1231
+ kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
1232
+ if "until" in kwargs.keys():
1233
+ until = kwargs.pop("until")
1234
+ if isinstance(until, str):
1235
+ until = [until]
1236
+ elif not isinstance(until, list):
1237
+ raise ValueError(
1238
+ f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
1239
+ )
1240
+ else:
1241
+ raise ValueError(
1242
+ f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
1243
+ )
1244
+ # add EOS token to stop sequences
1245
+ eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
1246
+ if not until:
1247
+ until = [eos]
1248
+ else:
1249
+ until.append(eos)
1250
+ if "max_gen_toks" in kwargs.keys():
1251
+ max_gen_toks = kwargs.pop("max_gen_toks")
1252
+ else:
1253
+ max_gen_toks = self.max_gen_toks
1254
+
1255
+ # set the max length in tokens of inputs ("context_enc")
1256
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
1257
+ # max len for inputs = max length, minus room to generate the max new tokens
1258
+ max_ctx_len = self.max_length - max_gen_toks
1259
+ elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
1260
+ # max len for inputs = encoder's whole max_length
1261
+ max_ctx_len = self.max_length
1262
+
1263
+ # encode, pad, and truncate contexts for this batch
1264
+ context_enc, attn_masks = self.tok_batch_encode(
1265
+ contexts,
1266
+ left_truncate_len=max_ctx_len,
1267
+ truncation=self.truncation,
1268
+ )
1269
+ context_enc = context_enc.to(self.device)
1270
+ attn_masks = attn_masks.to(self.device)
1271
+
1272
+ if "max_length" not in kwargs:
1273
+ kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
1274
+
1275
+ # perform batched generation
1276
+ cont = self._model_generate(
1277
+ context=context_enc,
1278
+ attention_mask=attn_masks,
1279
+ stop=until,
1280
+ **kwargs,
1281
+ )
1282
+
1283
+ cont_toks_list = cont.tolist()
1284
+ for cont_toks, context in zip(cont_toks_list, contexts):
1285
+ # discard context + left-padding toks if using causal decoder-only LM
1286
+ if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
1287
+ cont_toks = cont_toks[context_enc.shape[1] :]
1288
+
1289
+ s = self.tok_decode(cont_toks)
1290
+
1291
+ # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
1292
+ for term in until:
1293
+ if len(term) > 0:
1294
+ # ignore '' separator,
1295
+ # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
1296
+ s = s.split(term)[0]
1297
+
1298
+ res.append(s)
1299
+
1300
+ self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
1301
+ pbar.update(1)
1302
+ # reorder this group of results back to original unsorted form
1303
+ res = re_ords.get_original(res)
1304
+
1305
+ pbar.close()
1306
+
1307
+ return res
1308
+
1309
+ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
1310
+ """
1311
+ Method to apply a chat template to a list of chat history between user and model.
1312
+ """
1313
+ return self.tokenizer.apply_chat_template(
1314
+ chat_history, tokenize=False, add_generation_prompt=True
1315
+ )
1316
+
1317
+ def get_model_info(self) -> dict:
1318
+ """
1319
+ Method to get Hugging Face model information for experiment reproducibility.
1320
+ """
1321
+
1322
+ def get_model_num_params(model) -> int:
1323
+ if hasattr(model, "num_parameters"):
1324
+ return model.num_parameters()
1325
+ if hasattr(model, "parameters"):
1326
+ return sum(p.numel() for p in model.parameters())
1327
+ else:
1328
+ return -1
1329
+
1330
+ def get_model_dtype(model) -> str:
1331
+ if hasattr(model, "dtype"):
1332
+ return model.dtype
1333
+ else:
1334
+ return ""
1335
+
1336
+ def get_model_sha(pretrained: str, revision: str) -> str:
1337
+ try:
1338
+ model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
1339
+ return model_info.sha
1340
+ except Exception as e:
1341
+ eval_logger.warn(
1342
+ f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
1343
+ )
1344
+ return ""
1345
+
1346
+ model_info = {
1347
+ "model_num_parameters": get_model_num_params(self._model),
1348
+ "model_dtype": get_model_dtype(self._model),
1349
+ "model_revision": self.revision,
1350
+ "model_sha": get_model_sha(self.pretrained, self.revision),
1351
+ }
1352
+ if self.peft:
1353
+ model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
1354
+ if self.delta:
1355
+ model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
1356
+ return model_info
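Note on the `generate_until` implementation in the diff above: requests are grouped by their `gen_kwargs` through the `Collator` before batching, so greedy and sampled requests never end up in the same batch. Below is a minimal sketch of that grouping behaviour; it mirrors the `Collator`, `get_batched`, and `get_original` calls shown in the diff, while the example request data and the length-based sort key are made up for illustration.

```python
# Hypothetical sketch of the Collator grouping used by generate_until above.
# The request tuples are invented; only the Collator / get_batched usage
# mirrors the code in this diff.
from lm_eval.models.utils import Collator

requests = [
    ("Translate to French: cat =>", {"until": ["\n"], "do_sample": False}),
    ("Translate to French: dog =>", {"until": ["\n"], "do_sample": False}),
    ("Write a haiku about autumn.", {"until": ["\n"], "do_sample": True, "temperature": 0.8}),
]

re_ords = Collator(
    requests,
    sort_fn=lambda x: (-len(x[0]), x[0]),  # longest contexts first, analogous to _collate
    group_by="gen_kwargs",                 # keep differing sampling settings apart
    group_fn=lambda x: x[1],
)

for chunk in re_ords.get_batched(n=2, batch_fn=None):
    contexts, all_gen_kwargs = zip(*chunk)
    # within a chunk every gen_kwargs dict is identical, so using the first is safe
    print(len(contexts), all_gen_kwargs[0])

# re_ords.get_original(results) would restore results to the original request order.
```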
scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py ADDED
@@ -0,0 +1,537 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import importlib
16
+ import pathlib
17
+ from copy import deepcopy
18
+ from typing import List, Literal
19
+
20
+ import filelock
21
+ import numpy as np
22
+ import torch
23
+ from tqdm import tqdm
24
+
25
+ from lm_eval.api.instance import Instance
26
+ from lm_eval.api.model import LM
27
+ from lm_eval.api.registry import register_model
28
+ from lm_eval.models.utils import Collator
29
+ from lm_eval.utils import (
30
+ eval_logger,
31
+ get_rolling_token_windows,
32
+ make_disjoint_window,
33
+ simple_parse_args_string,
34
+ )
35
+
36
+
37
+ def _patch_pretrained_cfg(
38
+ pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
39
+ ):
40
+ try:
41
+ import omegaconf
42
+ except ModuleNotFoundError:
43
+ raise Exception(
44
+ "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
45
+ "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
46
+ "or installing nemo following https://github.com/NVIDIA/NeMo.",
47
+ )
48
+
49
+ omegaconf.OmegaConf.set_struct(pretrained_cfg, True)
50
+ with omegaconf.open_dict(pretrained_cfg):
51
+ attributes_to_update = {
52
+ "sequence_parallel": False,
53
+ "activations_checkpoint_granularity": None,
54
+ "activations_checkpoint_method": None,
55
+ "precision": trainer.precision,
56
+ "global_batch_size": None,
57
+ "tensor_model_parallel_size": tensor_model_parallel_size,
58
+ "pipeline_model_parallel_size": pipeline_model_parallel_size,
59
+ "apply_rope_fusion": False,
60
+ }
61
+ for name, value in attributes_to_update.items():
62
+ if hasattr(pretrained_cfg, name):
63
+ pretrained_cfg[name] = value
64
+ return pretrained_cfg
65
+
66
+
67
+ def _get_target_from_class(target_class) -> str:
68
+ return f"{target_class.__module__}.{target_class.__name__}"
69
+
70
+
71
+ def load_model(
72
+ model_path: str,
73
+ trainer,
74
+ tensor_model_parallel_size: int,
75
+ pipeline_model_parallel_size: int,
76
+ ) -> torch.nn.Module:
77
+ try:
78
+ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import (
79
+ MegatronGPTModel,
80
+ )
81
+ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
82
+ except ModuleNotFoundError:
83
+ raise Exception(
84
+ "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
85
+ "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
86
+ "or installing nemo following https://github.com/NVIDIA/NeMo.",
87
+ )
88
+ model_path = pathlib.Path(model_path)
89
+
90
+ save_restore_connector = NLPSaveRestoreConnector()
91
+ if model_path.is_dir():
92
+ save_restore_connector.model_extracted_dir = model_path.as_posix()
93
+ pretrained_cfg = save_restore_connector.restore_from(
94
+ None, model_path.as_posix(), return_config=True, trainer=trainer
95
+ )
96
+ if not hasattr(pretrained_cfg, "target"):
97
+ pretrained_cfg["target"] = _get_target_from_class(MegatronGPTModel)
98
+
99
+ pretrained_cfg = _patch_pretrained_cfg(
100
+ pretrained_cfg,
101
+ trainer,
102
+ tensor_model_parallel_size=tensor_model_parallel_size,
103
+ pipeline_model_parallel_size=pipeline_model_parallel_size,
104
+ )
105
+
106
+ model_to_load_path = model_path
107
+ override_config = pretrained_cfg
108
+
109
+ module_name, class_name = override_config.target.rsplit(".", 1)
110
+ model_class = getattr(importlib.import_module(module_name), class_name)
111
+
112
+ # monkeypatch _build_tokenizer method to be process-safe
113
+ tokenizer_lock = filelock.FileLock(f"/tmp/{model_path.name}.tokenizer.lock")
114
+
115
+ def _synced_build_tokenizer(self):
116
+ with tokenizer_lock:
117
+ self._original_build_tokenizer()
118
+
119
+ model_class._original_build_tokenizer = model_class._build_tokenizer
120
+ model_class._build_tokenizer = _synced_build_tokenizer
121
+
122
+ model = model_class.restore_from(
123
+ restore_path=model_to_load_path.as_posix(),
124
+ trainer=trainer,
125
+ override_config_path=override_config,
126
+ save_restore_connector=save_restore_connector,
127
+ map_location=f"cuda:{trainer.local_rank}",
128
+ )
129
+
130
+ model.freeze()
131
+ model.training = False
132
+ try:
133
+ # Have to turn off activations_checkpoint_method for inference
134
+ model.model.language_model.encoder.activations_checkpoint_method = None
135
+ except AttributeError:
136
+ pass
137
+ return model
138
+
139
+
140
+ def setup_distributed_environment(trainer):
141
+ try:
142
+ from nemo.utils.app_state import AppState
143
+ except ModuleNotFoundError:
144
+ raise Exception(
145
+ "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
146
+ "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
147
+ "or installing nemo following https://github.com/NVIDIA/NeMo.",
148
+ )
149
+
150
+ def dummy():
151
+ return
152
+
153
+ if trainer.strategy.launcher is not None:
154
+ trainer.strategy.launcher.launch(dummy, trainer=trainer)
155
+ trainer.strategy.setup_environment()
156
+
157
+ app_state = AppState()
158
+
159
+ return app_state
160
+
161
+
162
+ @register_model("nemo_lm")
163
+ class NeMoLM(LM):
164
+ def __init__(
165
+ self,
166
+ path: str,
167
+ max_length: int = 4096,
168
+ batch_size: int = 1,
169
+ max_gen_toks: int = 256,
170
+ devices: int = 1,
171
+ num_nodes: int = 1,
172
+ tensor_model_parallel_size: int = 1,
173
+ pipeline_model_parallel_size: int = 1,
174
+ precision: Literal[
175
+ "16-mixed",
176
+ "bf16-mixed",
177
+ "32-true",
178
+ "64-true",
179
+ 64,
180
+ 32,
181
+ 16,
182
+ "64",
183
+ "32",
184
+ "16",
185
+ "bf16",
186
+ ] = "bf16",
187
+ **kwargs,
188
+ ):
189
+ try:
190
+ from nemo.collections.nlp.modules.common.text_generation_utils import (
191
+ generate,
192
+ )
193
+ from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
194
+ from pytorch_lightning.trainer.trainer import Trainer
195
+
196
+ self.generate = generate
197
+ except ModuleNotFoundError:
198
+ raise Exception(
199
+ "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
200
+ "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
201
+ "or installing nemo following https://github.com/NVIDIA/NeMo.",
202
+ )
203
+
204
+ super().__init__()
205
+
206
+ if (
207
+ tensor_model_parallel_size == 1
208
+ and pipeline_model_parallel_size == 1
209
+ and devices > 1
210
+ ):
211
+ eval_logger.info(
212
+ f"The number of data replicas for evaluation is {devices}."
213
+ )
214
+ eval_logger.info(f"The total number of devices is {devices}.")
215
+ eval_logger.info(
216
+ "No tensor parallelism or pipeline parallelism is applied."
217
+ )
218
+
219
+ elif tensor_model_parallel_size * pipeline_model_parallel_size == devices:
220
+ eval_logger.info(
221
+ f"Setting tensor parallelism to {tensor_model_parallel_size} and pipeline parallelism to {pipeline_model_parallel_size}."
222
+ )
223
+ eval_logger.info(f"The total number of devices is {devices}.")
224
+ eval_logger.info("No data parallelism is applied.")
225
+
226
+ else:
227
+ raise ValueError(
228
+ "Please set the product of tensor_model_parallel_size and pipeline_model_parallel_size"
229
+ "equal to the specified number of devices."
230
+ )
231
+
232
+ if num_nodes > 1:
233
+ raise ValueError(
234
+ "A number of nodes greater than 1 is not supported yet. Please set num_nodes as 1."
235
+ )
236
+
237
+ trainer = Trainer(
238
+ strategy=NLPDDPStrategy(),
239
+ devices=devices,
240
+ accelerator="gpu",
241
+ num_nodes=num_nodes,
242
+ precision=precision,
243
+ logger=False,
244
+ enable_checkpointing=False,
245
+ use_distributed_sampler=False,
246
+ )
247
+ # Modify the following flags only for data replication
248
+ if (
249
+ tensor_model_parallel_size == 1
250
+ and pipeline_model_parallel_size == 1
251
+ and devices > 1
252
+ ):
253
+ self._device = torch.device(f"cuda:{trainer.global_rank}")
254
+ self._rank = trainer.global_rank
255
+ self._world_size = trainer.world_size
256
+ self.model = load_model(
257
+ path,
258
+ trainer,
259
+ tensor_model_parallel_size=tensor_model_parallel_size,
260
+ pipeline_model_parallel_size=pipeline_model_parallel_size,
261
+ ).cuda()
262
+ self.tokenizer = self.model.tokenizer
263
+ self.app_state = setup_distributed_environment(trainer)
264
+
265
+ self._max_length = max_length
266
+ self._batch_size = int(batch_size)
267
+ self._max_gen_toks = max_gen_toks
268
+
269
+ @classmethod
270
+ def create_from_arg_string(cls, arg_string, additional_config=None):
271
+ args = simple_parse_args_string(arg_string)
272
+ if additional_config:
273
+ args["batch_size"] = additional_config.get("batch_size", 1)
274
+
275
+ return cls(**args)
276
+
277
+ @property
278
+ def eot_token_id(self):
279
+ try:
280
+ return self.tokenizer.eos_id
281
+ except AttributeError:
282
+ return None
283
+
284
+ @property
285
+ def max_length(self):
286
+ return self._max_length
287
+
288
+ @property
289
+ def max_gen_toks(self):
290
+ return self._max_gen_toks
291
+
292
+ @property
293
+ def batch_size(self):
294
+ return self._batch_size
295
+
296
+ @property
297
+ def device(self):
298
+ return self._device
299
+
300
+ @property
301
+ def rank(self):
302
+ return self._rank
303
+
304
+ @property
305
+ def world_size(self):
306
+ return self._world_size
307
+
308
+ @property
309
+ def accelerator(self):
310
+ return self._Accelerator(self.world_size)
311
+
312
+ class _Accelerator:
313
+ def __init__(self, world_size):
314
+ self.world_size = world_size
315
+
316
+ def wait_for_everyone(self):
317
+ torch.distributed.barrier()
318
+
319
+ def gather(self, local_tensor):
320
+ gathered_tensors = [
321
+ torch.zeros(1, dtype=local_tensor.dtype).cuda()
322
+ for _ in range(self.world_size)
323
+ ]
324
+ torch.distributed.all_gather(gathered_tensors, local_tensor)
325
+ return torch.cat(gathered_tensors)
326
+
327
+ def tok_encode(self, string: str):
328
+ return self.tokenizer.text_to_ids(string)
329
+
330
+ def tok_decode(self, tokens):
331
+ return self.tokenizer.ids_to_text(tokens)
332
+
333
+ def _encode_pair(self, context, continuation):
334
+ n_spaces = len(context) - len(context.rstrip())
335
+ if n_spaces > 0:
336
+ continuation = context[-n_spaces:] + continuation
337
+ context = context[:-n_spaces]
338
+ whole_enc = self.tok_encode(context + continuation)
339
+ context_enc = self.tok_encode(context)
340
+ context_enc_len = len(context_enc)
341
+ continuation_enc = whole_enc[context_enc_len:]
342
+ return context_enc, continuation_enc
343
+
344
+ def loglikelihood(self, requests):
345
+ new_reqs = []
346
+ for context, continuation in [req.args for req in requests]:
347
+ if context == "":
348
+ # end of text as context
349
+ context_enc, continuation_enc = (
350
+ [self.eot_token_id],
351
+ self.tok_encode(continuation),
352
+ )
353
+ else:
354
+ context_enc, continuation_enc = self._encode_pair(context, continuation)
355
+
356
+ new_reqs.append(((context, continuation), context_enc, continuation_enc))
357
+
358
+ return self._loglikelihood_tokens(new_reqs)
359
+
360
+ def loglikelihood_rolling(
361
+ self, requests: List[Instance], disable_tqdm: bool = False
362
+ ) -> List[float]:
363
+ loglikelihoods = []
364
+
365
+ for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
366
+ rolling_token_windows = list(
367
+ map(
368
+ make_disjoint_window,
369
+ get_rolling_token_windows(
370
+ token_list=self.tok_encode(string),
371
+ prefix_token=self.eot_token_id,
372
+ max_seq_len=self.max_length - 1,
373
+ context_len=1,
374
+ ),
375
+ )
376
+ )
377
+
378
+ rolling_token_windows = [(None,) + x for x in rolling_token_windows]
379
+
380
+ string_nll = self._loglikelihood_tokens(
381
+ rolling_token_windows,
382
+ )
383
+
384
+ # discard is_greedy
385
+ string_nll = [x[0] for x in string_nll]
386
+
387
+ string_nll = sum(string_nll)
388
+ loglikelihoods.append(string_nll)
389
+ return loglikelihoods
390
+
391
+ def _loglikelihood_tokens(self, requests, disable_tqdm=False):
392
+ res = []
393
+
394
+ def _collate(x):
395
+ toks = x[1] + x[2]
396
+ return -len(toks), tuple(toks)
397
+
398
+ re_ord = Collator(requests, sort_fn=_collate)
399
+ chunks = re_ord.get_batched(n=self.batch_size, batch_fn=None)
400
+ pbar = tqdm(
401
+ total=len(requests),
402
+ disable=(disable_tqdm or (self.rank != 0)),
403
+ desc="Running loglikelihood requests",
404
+ )
405
+ for chunk in chunks:
406
+ inps = []
407
+ ctxlens = []
408
+ contlens = []
409
+
410
+ for _, context_enc, continuation_enc in chunk:
411
+ # Leave one token for generation. Tokens_to_generate = 0 breaks NeMo.
412
+ inp = (context_enc + continuation_enc)[-(self.max_length - 1) :]
413
+
414
+ ctxlen = len(context_enc) - max(
415
+ 0, len(context_enc) + len(continuation_enc) - (self.max_length - 1)
416
+ )
417
+ ctxlens.append(ctxlen)
418
+ contlens.append(len(continuation_enc))
419
+
420
+ inps.append(self.tok_decode(inp))
421
+
422
+ output = self.generate(
423
+ self.model,
424
+ inputs=inps,
425
+ tokens_to_generate=1,
426
+ min_tokens_to_generate=1,
427
+ compute_logprob=True,
428
+ all_probs=True,
429
+ )
430
+
431
+ batch_token_ids = np.asarray(output["token_ids"])[:, :-1]
432
+ batch_logprobs = output["logprob"][:, :-1]
433
+ batch_full_logprob = output["full_logprob"][:, :-1, :]
434
+
435
+ # Compute greedy tokens for entire batch rather than calling it with proper ctxlen for each sample.
436
+ # Additional tokens for each sample will be trimmed later.
437
+ min_ctxlen = min(ctxlens)
438
+
439
+ # Use min_ctxlen-1 instead of min_ctxlen since full_logprobs are not returned for the first token.
440
+ batch_greedy_tokens = (
441
+ torch.argmax(batch_full_logprob[:, min_ctxlen - 1 :, :], -1)
442
+ .cpu()
443
+ .numpy()
444
+ )
445
+
446
+ for token_ids, greedy_tokens, logprobs, ctxlen, contlen, (
447
+ cache_key,
448
+ _,
449
+ _,
450
+ ) in zip(
451
+ batch_token_ids,
452
+ batch_greedy_tokens,
453
+ batch_logprobs,
454
+ ctxlens,
455
+ contlens,
456
+ chunk,
457
+ ):
458
+ # Trim at contlen since shorter contexts in a batch will have more than one token generated.
459
+ # Use ctxlen-1 instead of ctxlen, the same as for full_logprob in the batch_greedy_tokens calculation
460
+ logprobs = (logprobs[ctxlen - 1 :])[:contlen]
461
+ logprob = sum(logprobs).tolist()
462
+
463
+ continuation_tokens = (token_ids[ctxlen:])[:contlen]
464
+ len_diff = ctxlen - min_ctxlen
465
+ is_greedy = continuation_tokens == (greedy_tokens[len_diff:])[:contlen]
466
+ if not isinstance(is_greedy, bool):
467
+ is_greedy = is_greedy.all()
468
+ answer = (logprob, is_greedy)
469
+
470
+ if cache_key is not None:
471
+ self.cache_hook.add_partial("loglikelihood", cache_key, answer)
472
+
473
+ res.append(answer)
474
+ pbar.update(1)
475
+
476
+ pbar.close()
477
+
478
+ return re_ord.get_original(res)
479
+
480
+ def generate_until(self, requests):
481
+ if not requests:
482
+ return []
483
+ res = []
484
+
485
+ def get_until(req_args):
486
+ until = req_args.get("until", [])
487
+ until = deepcopy(until) # prevent from modifying req_args for cache_key
488
+ if self.tokenizer.ids_to_tokens([self.eot_token_id])[0] not in until:
489
+ until.append(self.tokenizer.ids_to_tokens([self.eot_token_id])[0])
490
+ return until
491
+
492
+ def _collate(x):
493
+ toks = self.tok_encode(x[0])
494
+ return len(toks), x[0]
495
+
496
+ re_ords = Collator(
497
+ [reg.args for reg in requests], sort_fn=_collate, group_by="gen_kwargs"
498
+ )
499
+ chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
500
+ for chunk in chunks:
501
+ contexts, all_gen_kwargs = zip(*chunk)
502
+ # we assume all gen kwargs in the batch are the same
503
+ # this is safe to assume because the `grouper` object ensures it.
504
+ req_args = all_gen_kwargs[0]
505
+ # unpack our keyword arguments.
506
+ until = get_until(req_args)
507
+ max_gen_toks = req_args.get("max_gen_toks", self.max_gen_toks)
508
+
509
+ remaining_length = self.max_length - max_gen_toks
510
+ contexts = []
511
+ for context, _ in chunk:
512
+ encoded_context = self.tok_encode(context)
513
+ encoded_context = encoded_context[-remaining_length:]
514
+ contexts.append(self.tok_decode(encoded_context))
515
+
516
+ output = self.generate(
517
+ self.model,
518
+ inputs=contexts,
519
+ tokens_to_generate=max_gen_toks,
520
+ end_strings=until,
521
+ greedy=True,
522
+ )
523
+
524
+ answers = output["sentences"]
525
+
526
+ continuations = []
527
+ for context, answer in zip(contexts, answers):
528
+ continuations.append(answer[len(context) :])
529
+
530
+ for term in until:
531
+ continuations = [answer.split(term)[0] for answer in continuations]
532
+
533
+ for request, answer in zip(chunk, continuations):
534
+ self.cache_hook.add_partial("greedy_until", request, answer)
535
+ res.append(answer)
536
+
537
+ return re_ords.get_original(res)
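For completeness, a hypothetical usage sketch of the `nemo_lm` model registered above, driven through the harness's standard `simple_evaluate` entry point. The checkpoint path, task list, and argument values are placeholders, and NeMo must be installed as the error messages in the file note.

```python
# Hypothetical sketch: running the "nemo_lm" model through lm_eval's
# simple_evaluate entry point. The .nemo path and task name are placeholders.
import lm_eval

results = lm_eval.simple_evaluate(
    model="nemo_lm",
    model_args="path=/path/to/checkpoint.nemo,devices=1,max_length=4096,max_gen_toks=256",
    tasks=["lambada_openai"],
    batch_size=1,
)
print(results["results"])
```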