diff --git a/scripts/decode/en-ja/llama2/beam_search.sh b/scripts/decode/en-ja/llama2/beam_search.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8cf0818befb9ee56e05db71bbbf264db248fe4c5
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/beam_search.sh
@@ -0,0 +1,19 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+MAX_INPUT_TOKENS=158
+BEAM_SIZE=50
+
+python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+    --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt2024_test/en-ja/llama2-beam \
+    -g 0 1 2 3 4 5 6 7 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 3.0 \
+    --num_return_sequences ${BEAM_SIZE} \
+    --num_beams ${BEAM_SIZE} \
+    --max_input_tokens ${MAX_INPUT_TOKENS} \
+    -b 158
+
diff --git a/scripts/decode/en-ja/llama2/greedy_inference.sh b/scripts/decode/en-ja/llama2/greedy_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1fe3cb36a1b5ed6964b442053e94c2b5365775a7
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/greedy_inference.sh
@@ -0,0 +1,13 @@
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt24_test/en-ja/mistral-greedy \
+    -g 0 \
+    -b 4096 \
+    --dynamic_max_new_token_ratio 3.0
+
+echo "Done!"
+
diff --git a/scripts/decode/en-ja/llama2/hf_inference.sh b/scripts/decode/en-ja/llama2/hf_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1fe3cb36a1b5ed6964b442053e94c2b5365775a7
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/hf_inference.sh
@@ -0,0 +1,13 @@
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt24_test/en-ja/mistral-greedy \
+    -g 0 \
+    -b 4096 \
+    --dynamic_max_new_token_ratio 3.0
+
+echo "Done!"
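Note: the decoding flags above map onto standard Hugging Face generation arguments. The short Python sketch below is not part of the repository; it is a minimal illustration, assuming the hf_inference*.py tools wrap transformers' generate(), of what the beam-search, greedy, and (in the scripts that follow) nucleus-sampling settings roughly correspond to. The model path, prompt, and the max_new_tokens heuristic are placeholder assumptions.

# Illustrative sketch only -- not part of this repository.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/path/to/finetuned-checkpoint"  # hypothetical stand-in for the /work/models/... checkpoints
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, attn_implementation="sdpa")

prompt = "Translate the following English into Japanese: Hello."  # placeholder prompt template
inputs = tokenizer(prompt, return_tensors="pt")
# assumed reading of --dynamic_max_new_token_ratio 3.0: output budget scales with input length
max_new_tokens = int(inputs["input_ids"].shape[1] * 3.0)

# beam_search.sh: --num_beams 50 --num_return_sequences 50
beam_outputs = model.generate(
    **inputs, num_beams=50, num_return_sequences=50, max_new_tokens=max_new_tokens
)

# greedy_inference.sh / hf_inference.sh: no sampling flags, i.e. greedy decoding
greedy_output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)

# top_p_inference*.sh below: nucleus sampling, many candidates per source sentence
sampled_outputs = model.generate(
    **inputs, do_sample=True, top_p=0.95, num_return_sequences=50, max_new_tokens=max_new_tokens
)

print(tokenizer.batch_decode(greedy_output, skip_special_tokens=True)[0])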
+
diff --git a/scripts/decode/en-ja/llama2/top_p_inference.sh b/scripts/decode/en-ja/llama2/top_p_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f5315754a87cc492924da6d16fc5421401199cbe
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/top_p_inference.sh
@@ -0,0 +1,17 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+i=4
+GPU_ID=4
+python /code/llm-recipes/tools/hf_inference.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+    -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+    -o /work/translation/wmt24_test/en-ja/mistral-top-p-0.95/split_0${i} \
+    -g ${GPU_ID} \
+    -b 500 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 3.0 \
+    --num_return_sequences 100 \
+    --do_sample \
+    --top_p 0.95 &
diff --git a/scripts/decode/en-ja/llama2/top_p_inference_1.sh b/scripts/decode/en-ja/llama2/top_p_inference_1.sh
new file mode 100644
index 0000000000000000000000000000000000000000..118db3b3d19f0261b103102bcf8001c9378c232d
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/top_p_inference_1.sh
@@ -0,0 +1,20 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+for i in `seq 0 6`; do
+    python /code/llm-recipes/tools/hf_inference.py \
+        --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+        -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+        -o /work/translation/wmt24_test/en-ja/llama2-top-p-0.95/split_0${i} \
+        -g ${i} \
+        -b 158 \
+        --attn_implementation sdpa \
+        --dynamic_max_new_token_ratio 3.0 \
+        --num_return_sequences 50 \
+        --do_sample \
+        --top_p 0.95 \
+        --max_input_tokens 158 &
+done
+wait
+
diff --git a/scripts/decode/en-ja/llama2/top_p_inference_2.sh b/scripts/decode/en-ja/llama2/top_p_inference_2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a5e987894a82f07fe482302d15df7546103d9c4a
--- /dev/null
+++ b/scripts/decode/en-ja/llama2/top_p_inference_2.sh
@@ -0,0 +1,21 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+for i in `seq 7 9`; do
+    GPU_ID=$((i-5))
+    python /code/llm-recipes/tools/hf_inference.py \
+        --model /work/models/additiona_trained_hf/llama2-en-ja-continuous-pretrained-v0-dev-finetune-chunked-docs-all-averaged-841-845 \
+        -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+        -o /work/translation/wmt24_test/en-ja/llama2-top-p-0.95/split_0${i} \
+        -g ${GPU_ID} \
+        -b 158 \
+        --attn_implementation sdpa \
+        --dynamic_max_new_token_ratio 3.0 \
+        --num_return_sequences 50 \
+        --do_sample \
+        --top_p 0.95 \
+        --max_input_tokens 158 &
+done
+wait
+
diff --git a/scripts/decode/en-ja/mistral-ve/top_p_inference.sh b/scripts/decode/en-ja/mistral-ve/top_p_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..264af26aad7995586bb3e76b555f077963460849
--- /dev/null
+++ b/scripts/decode/en-ja/mistral-ve/top_p_inference.sh
@@ -0,0 +1,16 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-averaged-596-600 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt2024_test/en-ja/mistral-ve-top-p-0.95 \
+    -g 0 1 2 3 4 5 6 7 \
+    -b 125 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 2.0 \
+    --num_return_sequences 80 \
+    --do_sample \
+    --top_p 0.95 \
+    --max_input_tokens 125
diff --git a/scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh b/scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e3080622a2320beb3c4da8e234930417eafdedfb
--- /dev/null
+++ b/scripts/decode/en-ja/mistral-ve/top_p_inference_cpo.sh
@@ -0,0 +1,17 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+python /code/llm-recipes/tools/hf_inference_distrubuted.py \
+    --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-averaged-596-600 \
+    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl \
+    -o /work/translation/wmt2024_test/en-ja/mistral-ve-top-p-0.95-cpo \
+    -p /work/models/dpo/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-ve-sim-chunked-docs-all-cpo-lora/checkpoint-200 \
+    -g 0 1 2 3 4 5 6 7 \
+    -b 125 \
+    --attn_implementation sdpa \
+    --dynamic_max_new_token_ratio 2.0 \
+    --num_return_sequences 80 \
+    --do_sample \
+    --top_p 0.95 \
+    --max_input_tokens 125 \
diff --git a/scripts/decode/en-ja/mistral/top_p_inference_2.sh b/scripts/decode/en-ja/mistral/top_p_inference_2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0cfe66af2a59dd2dd679fc7e21c5e38147e7791b
--- /dev/null
+++ b/scripts/decode/en-ja/mistral/top_p_inference_2.sh
@@ -0,0 +1,20 @@
+set -eux
+LLM_RECIPES_DIR=/code/llm-recipes
+source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
+
+for i in `seq 8 9`; do
+    # minus 2 for gpu id
+    GPU_ID=$((i-2))
+    python /code/llm-recipes/tools/hf_inference.py \
+        --model /work/models/translation_finetuned_hf/mistral-llm-recipes-en-ja-continuous-pretrained-v1-dev-finetune-chunked-docs-all-averaged-71-75 \
+        -i /work/wmt2024_test/LLM/split/en-ja/wmttest2024.src.sentence_splited.with_template.en-ja.en.jsonl.0${i} \
+        -o /work/translation/wmt24_test/en-ja/mistral-top-p-0.95/split_0${i} \
+        -g ${GPU_ID} \
+        -b 400 \
+        --attn_implementation sdpa \
+        --dynamic_max_new_token_ratio 3.0 \
+        --num_return_sequences 100 \
+        --do_sample \
+        --top_p 0.95 &
+done
+wait
diff --git a/scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml b/scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b748aab5c06533fd3f8d41cfd519841a9af93f75
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/.github/workflows/new_tasks.yml
@@ -0,0 +1,72 @@
+name: Tasks Modified
+
+on:
+  push:
+    branches:
+      - 'main'
+  pull_request:
+    branches:
+      - 'main'
+  workflow_dispatch:
+# comment/edit out the above to stop/change the triggers
+jobs:
+  changed_files:
+    runs-on: ubuntu-latest # windows-latest || macos-latest
+    timeout-minutes: 120
+    name: Scan for changed tasks
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
+
+      # Uses the tj-actions/changed-files action to check for changes.
+ # Outputs provided here: https://github.com/tj-actions/changed-files#outputs + # The `files_yaml` input optionally takes a yaml string to specify filters, + # and prepends the filter name to the standard output names. + - name: Check task folders + id: changed-tasks + uses: tj-actions/changed-files@v44.5.2 + with: + # tasks checks the tasks folder and api checks the api folder for changes + files_yaml: | + tasks: + - lm_eval/tasks/** + api: + - lm_eval/api/** + write_output_files: true + + # The next step is optional; the files are written to the workspace by default (above). + # so it's just for debugging + - name: Run Tests + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + run: | + echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV' + echo "One or more test file(s) has changed." + echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" + + - name: Set up Python 3.9 + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + uses: actions/setup-python@v4 + with: + python-version: 3.9 + cache: 'pip' + cache-dependency-path: setup.py + - name: Install dependencies + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu + # Install optional git dependencies + # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt + # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + # if new tasks are added, run tests on them + if: steps.changed-tasks.outputs.tasks_any_modified == 'true' + run: python -m pytest tests/test_tasks.py -s -vv + # if api is modified, run tests on it + - name: Test more tasks with pytest + env: + API: true + if: steps.changed-tasks.outputs.api_any_modified == 'true' + run: python -m pytest tests/test_tasks.py -s -vv diff --git a/scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml b/scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..be3481754e270f28bcb65e8c75b880aa7ebf2bac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/.github/workflows/publish.yml @@ -0,0 +1,78 @@ +name: Publish Python distribution to PyPI + +on: + push: + tags: + - '*' + +jobs: + build: + name: Build distribution + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v3 + with: + name: python-package-distributions + path: dist/ + + publish-to-pypi: + name: >- + Publish Python distribution to PyPI + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/lm_eval + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: 
python-package-distributions + path: dist/ + - name: Publish distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + publish-to-testpypi: + name: Publish Python distribution to TestPyPI + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/lm_eval + + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v3 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml b/scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..49b85fb9a4541f6c6dfecd4395a4544dc4ec5aac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/.github/workflows/unit_tests.yml @@ -0,0 +1,95 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python +# just comment out unwanted steps to turn off the test. +name: Unit Tests + +on: + push: + branches: + - 'main' + pull_request: + branches: + - 'main' + workflow_dispatch: +# Jobs run concurrently and steps run sequentially within a job. +# jobs: linter and cpu_tests. Add more jobs/steps as required. +jobs: + linter: + name: Linters + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Pre-Commit + env: + SKIP: "no-commit-to-branch,mypy" + + uses: pre-commit/action@v3.0.1 +# # mypy turned off for now +# - name: Lint with mypy +# run: mypy . 
--ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable +# Job 2 + testcpu: + name: CPU Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11" ] + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu +# Install optional git dependencies +# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt +# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with pytest + run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* + testmodels: + name: External LM Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Test with pytest + run: python -m pytest tests/models --showlocals -s -vv + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd8d5da70352396b88119f1cd2c83f04b44dda96 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..600b26076c58d09bf2afd2f42873a287dfd3827f Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/filter.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adb21c6123f66da878826f08203d06c2ee84bafa Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/group.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..61f695e8d01399953a4e6337655904008c885020 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/instance.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..539173175d4d22ee449401c400b33aa776b35880 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/metrics.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba146834fe2c7611fef4b73d6f56056d23e63794 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/model.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b7035b7408d03e277256a87ecd5c3511ca957e5 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/registry.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9eef8d4a193e4a1e324db212fbbc2ec43961b613 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/samplers.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f16dab506fe90e374c3fd993ba602ba732ef0dc Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/api/__pycache__/task.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..8d9db6821724c497c4a27116a1238e3b8d32ae29 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/filter.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Callable, Iterable, List, Union + +from lm_eval.api.instance import Instance + + +class Filter(ABC): + """ + Filter classes operate on a per-task level. + They take all model outputs (`instance.resps` for all `task.instances`) + across all instances of a task, and perform operations. + In a single run, one can configure any number of separate filters or lists of filters. + + """ + + def __init__(self, **kwargs) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. + """ + + @abstractmethod + def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable: + """ + Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. + Should return the list of (filtered) response lists *in the same order as they were input*, e.g. 
+ if pass in [, ] should return + [, ] + """ + return resps + + +@dataclass +class FilterEnsemble: + """ + FilterEnsemble creates a pipeline applying multiple filters. + Its intended usage is to stack multiple post-processing steps in order. + `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each + pipeline separately. + """ + + name: str + filters: List[Callable[[], Filter]] + + def apply(self, instances: List[Instance]) -> None: + resps, docs = zip(*((inst.resps, inst.doc) for inst in instances)) + resps, docs = list(resps), list(docs) + + for f in self.filters: + # apply filters in sequence + resps = f().apply(resps, docs) + + # add the end results after filtering to filtered_requests of their respective source instances. + # has key `self.name`: each FilterEnsemble applied in a given run should use a different name. + for inst, resp in zip(instances, resps): + inst.filtered_resps[self.name] = resp diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/group.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/group.py new file mode 100644 index 0000000000000000000000000000000000000000..534e6ad0103ee5aa79c6badc5550b1b355b718f7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/group.py @@ -0,0 +1,117 @@ +import abc +from dataclasses import asdict, dataclass +from inspect import getsource +from typing import Any, Callable, List, Optional, Union + + +@dataclass +class AggMetricConfig(dict): + metric: Optional[str] = None + aggregation: Optional[str] = "mean" + weight_by_size: Optional[str] = False + # list of filter names which should be incorporated into the aggregated metric. + filter_list: Optional[Union[str, list]] = "none" + + def __post_init__(self): + if self.aggregation != "mean": + raise ValueError( + f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{self.aggregation}'." + ) + + if isinstance(self.filter_list, str): + self.filter_list = [self.filter_list] + + +@dataclass +class GroupConfig(dict): + group: Optional[str] = None + group_alias: Optional[str] = None + task: Optional[Union[str, list]] = None + aggregate_metric_list: Optional[ + Union[List[AggMetricConfig], AggMetricConfig, dict] + ] = None + metadata: Optional[dict] = ( + None # by default, not used in the code. allows for users to pass arbitrary info to tasks + ) + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + return setattr(self, item, value) + + def __post_init__(self): + if self.aggregate_metric_list is not None: + if isinstance(self.aggregate_metric_list, dict): + self.aggregate_metric_list = [self.aggregate_metric_list] + + self.aggregate_metric_list = [ + AggMetricConfig(**item) if isinstance(item, dict) else item + for item in self.aggregate_metric_list + ] + + def to_dict(self, keep_callable: bool = False) -> dict: + """dumps the current config as a dictionary object, as a printable format. + null fields will not be printed. + Used for dumping results alongside full task configuration + + :return: dict + A printable dictionary version of the TaskConfig object. + + # TODO: should any default value in the TaskConfig not be printed? 
+ """ + cfg_dict = asdict(self) + # remove values that are `None` + for k, v in list(cfg_dict.items()): + if callable(v): + cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) + return cfg_dict + + def serialize_function( + self, value: Union[Callable, str], keep_callable=False + ) -> Union[Callable, str]: + """Serializes a given function or string. + + If 'keep_callable' is True, the original callable is returned. + Otherwise, attempts to return the source code of the callable using 'getsource'. + """ + if keep_callable: + return value + else: + try: + return getsource(value) + except (TypeError, OSError): + return str(value) + + +class ConfigurableGroup(abc.ABC): + def __init__( + self, + config: Optional[dict] = None, + ) -> None: + self._config = GroupConfig(**config) + + @property + def group(self): + return self._config.group + + @property + def group_alias(self): + return self._config.group_alias + + @property + def version(self): + return self._config.version + + @property + def config(self): + return self._config.to_dict() + + @property + def group_name(self) -> Any: + return self._config.group + + def __repr__(self): + return ( + f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py new file mode 100644 index 0000000000000000000000000000000000000000..d3c6afa0644e729ba441728c72a2469fdad07b8f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/instance.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass, field +from typing import Literal, Optional, Tuple + + +OutputType = Literal[ + "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice" +] + + +@dataclass +class Instance: + request_type: OutputType + doc: dict + arguments: tuple + idx: int + metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field( + default_factory=lambda: (None, None, None) + ) + resps: list = field(default_factory=list) + filtered_resps: dict = field(default_factory=dict) + + # initialized after init + task_name: Optional[str] = None + doc_id: Optional[int] = None + repeats: Optional[int] = None + + def __post_init__(self) -> None: + # unpack metadata field + self.task_name, self.doc_id, self.repeats = self.metadata + + @property + def args(self): + """ + Returns (string,) where `string` is the string to calculate loglikelihood over + """ + return ( + self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..a8459aa7397fd02947917dad616520bb4cb777bd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/metrics.py @@ -0,0 +1,570 @@ +import logging +import math +import random +import re +import string +from collections.abc import Iterable +from typing import List + +import numpy as np +import sacrebleu + +from lm_eval.api.registry import register_aggregation, register_metric + + +eval_logger = logging.getLogger("lm-eval") + + +# Register Aggregations First +@register_aggregation("bypass") +def bypass_agg(arr): + return 999 + + +@register_aggregation("mean") +def mean(arr): + return sum(arr) / len(arr) + + +@register_aggregation("median") +def median(arr): + return arr[len(arr) // 2] + + +# Certain metrics must be calculated across all documents in a benchmark. 
+# We use them as aggregation metrics, paired with no-op passthrough metric fns. +@register_aggregation("perplexity") +def perplexity(items): + return math.exp(-mean(items)) + + +@register_aggregation("weighted_perplexity") +def weighted_perplexity(items): + return math.exp(-weighted_mean(items)) + + +@register_aggregation("bits_per_byte") +def bits_per_byte(items): + return -weighted_mean(items) / math.log(2) + + +@register_aggregation("f1") +def f1_score(items): + from sklearn.metrics import f1_score + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds) + + return np.max(fscore) + + +@register_aggregation("matthews_corrcoef") +def matthews_corrcoef(items): + from sklearn.metrics import matthews_corrcoef + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + return matthews_corrcoef(golds, preds) + + +@register_aggregation("bleu") +def bleu(items): + """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric + for evaluating a generated sentence to a reference sentence. It counts matching + n-grams in the candidate translation to n-grams in the reference text, where + 1-gram or unigram would be each token and a bigram comparison would be each + word pair. The comparison is made regardless of word order + Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/ + Paper: https://www.aclweb.org/anthology/P02-1040/ + + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_bleu(preds, refs).score + + +@register_aggregation("chrf") +def chrf(items): + """chrF++ is a tool for automatic evaluation of machine translation output + based on character n-gram precision and recall enhanced with word n-grams. 
+ Source: https://github.com/m-popovic/chrF + Paper: https://www.aclweb.org/anthology/W15-3049.pdf + + Higher is better # TODO I think + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_chrf(preds, refs).score + + +@register_aggregation("ter") +def ter(items): + """Translation Error Rate is an error metric for machine translation that + measures the number of edits required to change a system output into one + of the references + Source: http://www.cs.umd.edu/~snover/tercom/ + Paper: http://mt-archive.info/AMTA-2006-Snover.pdf + + Lower is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_ter(preds, refs).score + + +@register_aggregation("brier_score") +def brier_score(items): # This is a passthrough function + gold, predictions = list(zip(*items)) + bs, num_class = np.array(predictions).shape + + gold = list(gold) + gold_one_hot = np.eye(num_class)[gold] + return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1)) + + +@register_metric( + metric="brier_score", + higher_is_better=False, + output_type=["multiple_choice"], + aggregation="brier_score", +) +def brier_score_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice"], + aggregation="mean", +) +def acc_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_norm", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice"], + aggregation="mean", +) +def acc_norm_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_mutual_info", + higher_is_better=True, + output_type="multiple_choice", + aggregation="mean", +) +def acc_mutual_info_fn(items): # This is a passthrough function + return items + + +### the code used in the `exact_match_hf_evaluate` function is ported from +### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py +### which is under the apache license. + +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+def exact_match_hf_evaluate( + predictions, + references, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, +): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + references = np.array([re.sub(s, "", x) for x in references]) + else: + predictions = np.asarray(predictions) + references = np.asarray(references) + + if ignore_case: + predictions = np.char.lower(predictions) + references = np.char.lower(references) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + score_list = predictions == references + + return {"exact_match": np.mean(score_list)} + + +### + + +@register_metric( + metric="exact_match", + higher_is_better=True, + output_type="generate_until", + aggregation="mean", +) +def exact_match_fn(**kwargs): + return exact_match_hf_evaluate(**kwargs) + + +@register_metric( + metric="perplexity", + higher_is_better=False, + output_type="loglikelihood", + aggregation="perplexity", +) +def perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="word_perplexity", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="weighted_perplexity", +) +def word_perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="byte_perplexity", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="weighted_perplexity", +) +def byte_perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="bits_per_byte", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="bits_per_byte", +) +def bits_per_byte_fn(items): # This is a passthrough function + return items + + +def pop_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr)) + + +def sample_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1)) + + +def mean_stderr(arr): + return sample_stddev(arr) / math.sqrt(len(arr)) + + +@register_metric( + metric="bypass", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice", "generate_until"], + aggregation="bypass", +) +def bypass(items): + return None + + +@register_metric( + metric="mcc", + higher_is_better=True, + output_type="multiple_choice", + aggregation="matthews_corrcoef", +) +def mcc_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="f1", + higher_is_better=True, + output_type="multiple_choice", + aggregation="f1", +) +def f1_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="bleu", + higher_is_better=True, + output_type="generate_until", + aggregation="bleu", +) +def bleu_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="chrf", + higher_is_better=True, + output_type="generate_until", + aggregation="chrf", +) +def chrf_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="ter", + higher_is_better=True, + 
output_type="generate_until", + aggregation="ter", +) +def ter_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_all", + higher_is_better=True, + output_type="loglikelihood", + aggregation="mean", +) +def acc_all(items): + # Only count as correct if all answers are labeled correctly for each question + question_scoring_dict = {} + preds = list(zip(*items))[0] + docs = list(zip(*items))[1] + + for doc, pred in zip(docs, preds): + paragraph_id = doc["idx"]["paragraph"] + question_id = doc["idx"]["question"] + if (paragraph_id, question_id) not in question_scoring_dict: + question_scoring_dict[(paragraph_id, question_id)] = [] + + gold_label = doc["label"] == 1 + + question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred) + acc = np.mean([int(all(x)) for x in question_scoring_dict.values()]) + return acc + + +def acc_all_stderr(items): + # Only count as correct if all answers are labeled correctly for each question + question_scoring_dict = {} + preds = list(zip(*items))[0] + docs = list(zip(*items))[1] + + for doc, pred in zip(docs, preds): + question_id = doc["idx"]["question"] + if question_id not in question_scoring_dict: + question_scoring_dict[question_id] = [] + + gold_label = doc["label"] == 1 + question_scoring_dict[question_id].append(gold_label == pred) + + acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()]) + return acc + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + """Compute max metric between prediction and each ground truth.""" + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def weighted_mean(items): + a, b = zip(*items) + return sum(a) / sum(b) + + +def is_non_str_iterable(obj): + return isinstance(obj, Iterable) and not isinstance(obj, str) + + +def _sacreformat(refs, preds): + """Format refs and preds for sacrebleu corpus calculation. It is very particular""" + # Sacrebleu expects (List[str], List[List[str]) + # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...]) + + # Note [ref1_stream] is the first reference for each pred. + # So lists are size N and (M, N) for N preds and M possible refs for each pred + # This is a different order of dimensions that I would expect + + # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds + # Must become List[List[str]] with the inner list corresponding to preds + if not is_non_str_iterable(refs): + refs = list(refs) + if not is_non_str_iterable(refs[0]): + refs = [[ref] for ref in refs] + refs = list(zip(*refs)) + # Note the number of refs in each ref list much match the number of preds + + # We expect preds to be List[str] or List[List[str]]. 
Must become List[str] + if not is_non_str_iterable(preds): + preds = list(preds) + if is_non_str_iterable(preds[0]): + assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}" + preds = [pred[0] for pred in preds] + + return refs, preds + + +# stderr stuff + + +class _bootstrap_internal: + def __init__(self, f, n) -> None: + self.f = f + self.n = n + + def __call__(self, v): + i, xs = v + rnd = random.Random() + rnd.seed(i) + res = [] + for _ in range(self.n): + res.append(self.f(rnd.choices(xs, k=len(xs)))) + return res + + +def bootstrap_stderr(f, xs, iters): + import multiprocessing as mp + + pool = mp.Pool(mp.cpu_count()) + # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something + # equivalent to stderr calculated without Bessel's correction in the stddev. + # Unfortunately, I haven't been able to figure out what the right correction is + # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but + # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator) + # Thankfully, shouldn't matter because our samples are pretty big usually anyways + res = [] + chunk_size = min(1000, iters) + from tqdm import tqdm + + print("bootstrapping for stddev:", f.__name__) + for bootstrap in tqdm( + pool.imap( + _bootstrap_internal(f, chunk_size), + [(i, xs) for i in range(iters // chunk_size)], + ), + total=iters // chunk_size, + ): + # sample w replacement + res.extend(bootstrap) + + pool.close() + return sample_stddev(res) + + +def stderr_for_metric(metric, bootstrap_iters: int): + if bootstrap_iters <= 0: + # return no function (don't compute stderr) if bootstrap iters = 0 + return None + + bootstrappable = [ + median, + matthews_corrcoef, + f1_score, + perplexity, + bleu, + chrf, + ter, + ] + + if metric in bootstrappable: + return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters) + + stderr = {mean: mean_stderr, acc_all: acc_all_stderr} + + return stderr.get(metric, None) + + +def pooled_sample_stderr(stderrs: List[float], sizes: List[int]): + # Used to aggregate bootstrapped stderrs across subtasks in a group, + # when we are weighting by the size of each subtask. + # + + assert len(stderrs) == len(sizes) + + # formula source: https://en.wikipedia.org/wiki/Pooled_variance + # and: https://stats.stackexchange.com/a/4841331 + # this empirically seems to match running `stderr_for_metric` on all instances + # from the subtasks concatenated with each other. + pooled_sample_var = ( + sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)]) + ) / (sum(sizes) - len(sizes)) + + return np.sqrt(pooled_sample_var / sum(sizes)) + + +def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None): + assert ( + metrics is not None + ), "Need to pass a list of each subtask's metric for this stderr aggregation" + assert len(stderrs) == len(sizes) and len(sizes) == len(metrics) + + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation. + # This formula depends on sample means. + # removed because it seems to give erroneously huge stderrs for groupings of tasks + # and does not seem to match up with bootstrap-calculated stderrs for groups. 
+ + ### don't use this unless a statistician has told you it's the right thing to do ### + + # accumulators: we'll aggregate pairwise N - 1 times + variance = stderrs[0] ** 2 + curr_size = sizes[0] + curr_score = metrics[0] + + for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]): + curr_score = ((curr_score * curr_size) + (score * size)) / ( + curr_size + size + ) # NOTE: this assumes our aggregation fn is "mean" + + variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / ( + curr_size + size - 1 + ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * ( + curr_score - score + ) ** 2 + + return np.sqrt(variance) + + +def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True): + # A helper function that is used to aggregate + # subtask scores cross-task. + # TODO: does not hold for non-mean aggregations + if not weight_by_size: + sizes = [1] * len(sizes) + + assert len(metrics) == len(sizes) + + return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/model.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a3602736d230b196eac4d384978ae1b62b7b4fe2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/model.py @@ -0,0 +1,385 @@ +import abc +import hashlib +import json +import logging +import os +from typing import Dict, List, Optional, Tuple, Type, TypeVar + +import transformers +from sqlitedict import SqliteDict +from tqdm import tqdm + +from lm_eval import utils + + +eval_logger = logging.getLogger("lm-eval") + +T = TypeVar("T", bound="LM") + + +class LM(abc.ABC): + def __init__(self) -> None: + """Defines the interface that should be implemented by all LM subclasses. + LMs are assumed to take text (strings) as input and yield strings as output + (inputs/outputs should be tokenization-agnostic.) + + """ + # set rank and world size to a single process, by default. + self._rank = 0 + self._world_size = 1 + self.cache_hook = CacheHook(None) + + @abc.abstractmethod + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """Compute log-likelihood of generating a continuation from a context. + Downstream tasks should attempt to use loglikelihood instead of other + LM calls whenever possible. + + :param requests: list[Instance] + A list of Instance objects, with property `args` which returns a tuple (context, continuation). + `context: str` + Context string. Implementations of LM must be able to handle an + empty context string. + `continuation: str` + The continuation over which log likelihood will be calculated. If + there is a word boundary, the space should be in the continuation. + For example, context="hello" continuation=" world" is correct. + + :return: list[tuple[float, bool]] + A list of pairs (logprob, isgreedy) + `logprob: float` + The log probability of `continuation`. + `isgreedy`: + Whether `continuation` would be generated by greedy sampling from `context`. + """ + pass + + @abc.abstractmethod + def loglikelihood_rolling(self, requests) -> List[float]: + """Compute full log-likelihood of a string, with no truncation, for perplexity computation + - We will use the full max context length of the model. + - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to + the max context length. 
+ - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations + which may simply concatenate multiple documents together. + - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into + multiple chunks, the last input will still a full-sized context. + Example: + Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ] + Prefix: BOS/EOS + Max context length: 4 + Resulting input/prediction pairs: + + INPUT: BOS 0 1 2 + PRED: 0 1 2 3 + + INPUT: 3 4 5 6 + PRED: 4 5 6 7 + + INPUT: 5 6 7 8 + PRED: 8 9 + + Observe that: + 1. Each token is predicted exactly once + 2. For the last pair, we provide the full context, but only score the last two tokens + + :param requests: list[Instance] + A list of Instance objects with property `args` which returns a tuple (context,). + string: str + String for which we are computing overall loglikelihood + :return: list[tuple[float]] + A list of tuples (logprob,) + logprob: float + The log probability of `context` conditioned on the BOS/EOS token. + Can also be overridden for custom cases by `prefix_token_id`. + """ + pass + + # TODO: Add an optional max length + @abc.abstractmethod + def generate_until(self, requests) -> List[str]: + """Generate greedily until a stopping sequence + + :param requests: list[Instance] + A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs). + context: str + Context string + gen_kwargs: dict + A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc. + :return: list[str] + A list of model generated continuations. + continuation: str + The generated continuation. + """ + pass + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + """ + Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM. + + :param chat_history: list[dict[str, str]] + A list of dictionaries with keys 'role' and 'content'. + Values are strings representing the role name and the content of the message, respectively. + :return: str + A string representing the chat history in a format that can be used as input to the LM. + """ + raise NotImplementedError( + "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type." + ) + + @classmethod + def create_from_arg_string( + cls: Type[T], arg_string: str, additional_config: Optional[dict] = None + ) -> T: + """ + Creates an instance of the LM class using the given argument string and additional config. + + Parameters: + - arg_string: A string containing arguments in the format key1=value1,key2=value2. + - additional_config: Optional dictionary containing additional configuration parameters. + + Returns: + - Instance of the LM class. + """ + additional_config = {} if additional_config is None else additional_config + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(**args, **args2) + + @classmethod + def create_from_arg_obj( + cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None + ) -> T: + """ + Creates an instance of the LM class using the given arg_obj + + Parameters: + - arg_obj: A dict containing arguments in the format key1=value1,key2=value2. + - additional_config: Optional dictionary containing additional configuration parameters. + + Returns: + - Instance of the LM class. 
+ """ + + additional_config = {} if additional_config is None else additional_config + additional_config = { + k: v for k, v in additional_config.items() if v is not None + } + + return cls(**arg_dict, **additional_config) + + @property + def rank(self): + # used in the case of parallelism. Hardcoded to + # ensure no errors arise using API models which do + # not support multi-device parallelism nor expect it. + return self._rank + + @property + def world_size(self): + # used in the case of parallelism. Hardcoded to + # ensure no errors arise using API models which do + # not support multi-device parallelism nor expect it. + return self._world_size + + @property + def tokenizer_name(self) -> str: + """Must be defined for LM subclasses which implement Chat Templating. + Should return the name of the tokenizer or chat template used. + Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used. + """ + raise NotImplementedError( + "To use this model with chat templates, please implement the 'tokenizer_name' property." + ) + + @property + def chat_template(self) -> str: + """Must be defined for LM subclasses that implement Chat Templating. + Should return the structure of the chat template applied to user/assistant messages. + This is used only to save in the experiment results for reproducibility. + """ + raise NotImplementedError( + "To use this model with chat templates, please implement the 'chat_template' property." + ) + + def set_cache_hook(self, cache_hook) -> None: + self.cache_hook = cache_hook + + +### SQLite-based caching of LM responses +def hash_args(attr, args): + dat = json.dumps([attr] + list(args)) + return hashlib.sha256(dat.encode("utf-8")).hexdigest() + + +class CacheHook: + def __init__(self, cachinglm) -> None: + if cachinglm is None: + self.dbdict = None + return + + self.dbdict = cachinglm.dbdict + + def add_partial(self, attr, req, res) -> None: + if self.dbdict is None: + return + hsh = hash_args(attr, req) + self.dbdict[hsh] = res + + +class CachingLM: + def __init__(self, lm, cache_db) -> None: + """LM wrapper that returns cached results if they exist, and uses the underlying LM if not. + + :param lm: LM + Underlying LM + :param cache_db: str + Path to cache db + """ + self.lm = lm + self.cache_db = cache_db + if os.path.dirname(cache_db): + os.makedirs(os.path.dirname(cache_db), exist_ok=True) + self.dbdict = SqliteDict(cache_db, autocommit=True) + + # add hook to lm + lm.set_cache_hook(self.get_cache_hook()) + + def __getattr__(self, attr: str): + lm_attr = getattr(self.lm, attr) + if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]: + eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM") + return lm_attr + + def fn(requests): + res = [] + remaining_reqs = [] + warned = False + # figure out which ones are cached and which ones are new + eval_logger.info( + f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..." + ) + for req in tqdm(requests, desc="Checking cached requests"): + hsh = hash_args(attr, req.args) + if attr == "generate_until" and req.args[1].get("do_sample", False): + # when we are doing non-greedy generation, don't use the cache + # (else every "randomly sampled" generation would be identical for repeats > 1). + if not warned: + eval_logger.warning( + f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." 
+ ) + warned = True + res.append(None) + remaining_reqs.append(req) + elif hsh in self.dbdict: + ob = self.dbdict[hsh] + + assert ob is not None + + res.append(ob) + else: + res.append(None) + remaining_reqs.append(req) + eval_logger.info( + f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}" + ) + # actually run the LM on the requests that do not have cached results + rem_res = getattr(self.lm, attr)(remaining_reqs) + + # stick the new ones back into the list and also cache any of the new ones + resptr = 0 + for req, r in zip(remaining_reqs, rem_res): + while res[resptr] is not None: + resptr += 1 + + res[resptr] = r + + # caching + hsh = hash_args(attr, req.args) + self.dbdict[hsh] = r + self.dbdict.commit() + + return res + + return fn + + def get_cache_hook(self): + return CacheHook(self) + + +class TemplateLM(LM): + """ + A class acting as intermediary between the LM base class + and boilerplate often included in other LM subclasses. + """ + + @property + @abc.abstractmethod + def eot_token_id(self): + pass + + @property + def prefix_token_id(self): + # it is used as prefix for loglikelihood + return self.eot_token_id + + @abc.abstractmethod + def tok_encode(self, string: str, **kwargs) -> List[int]: + """ + Tokenize a string using the model's tokenizer and return a list of token IDs. + """ + pass + + @abc.abstractmethod + def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: + pass + + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + + model_class = getattr(self, "AUTO_MODEL_CLASS", None) + + if model_class == transformers.AutoModelForSeq2SeqLM: + context_enc = self.tok_encode(context) + continuation_enc = self.tok_encode(continuation, add_special_tokens=False) + else: + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + + return context_enc, continuation_enc + + def loglikelihood( + self, requests, disable_tqdm: bool = False + ) -> List[Tuple[float, bool]]: + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + # BOS or EOS as context + context_enc, continuation_enc = ( + [self.prefix_token_id], + self.tok_encode(continuation), + ) + else: + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm) + + @abc.abstractmethod + def loglikelihood_rolling( + self, requests, disable_tqdm: bool = False + ) -> List[float]: + pass + + @abc.abstractmethod + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + pass diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..7446a429e61d9b287c384b5be5db2a258ea83ae8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/registry.py @@ -0,0 +1,192 @@ +import logging +from typing import Callable, Dict + +import evaluate as hf_evaluate + +from lm_eval.api.model import LM + + +eval_logger = logging.getLogger("lm-eval") + +MODEL_REGISTRY = {} + + +def 
register_model(*names): + # either pass a list or a single alias. + # function receives them as a tuple of strings + + def decorate(cls): + for name in names: + assert issubclass( + cls, LM + ), f"Model '{name}' ({cls.__name__}) must extend LM class" + + assert ( + name not in MODEL_REGISTRY + ), f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead." + + MODEL_REGISTRY[name] = cls + return cls + + return decorate + + +def get_model(model_name): + try: + return MODEL_REGISTRY[model_name] + except KeyError: + raise ValueError( + f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}" + ) + + +TASK_REGISTRY = {} +GROUP_REGISTRY = {} +ALL_TASKS = set() +func2task_index = {} + + +def register_task(name): + def decorate(fn): + assert ( + name not in TASK_REGISTRY + ), f"task named '{name}' conflicts with existing registered task!" + + TASK_REGISTRY[name] = fn + ALL_TASKS.add(name) + func2task_index[fn.__name__] = name + return fn + + return decorate + + +def register_group(name): + def decorate(fn): + func_name = func2task_index[fn.__name__] + if name in GROUP_REGISTRY: + GROUP_REGISTRY[name].append(func_name) + else: + GROUP_REGISTRY[name] = [func_name] + ALL_TASKS.add(name) + return fn + + return decorate + + +OUTPUT_TYPE_REGISTRY = {} +METRIC_REGISTRY = {} +METRIC_AGGREGATION_REGISTRY = {} +AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {} +HIGHER_IS_BETTER_REGISTRY = {} +FILTER_REGISTRY = {} + +DEFAULT_METRIC_REGISTRY = { + "loglikelihood": [ + "perplexity", + "acc", + ], + "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"], + "multiple_choice": ["acc", "acc_norm"], + "generate_until": ["exact_match"], +} + + +def register_metric(**args): + # TODO: do we want to enforce a certain interface to registered metrics? + def decorate(fn): + assert "metric" in args + name = args["metric"] + + for key, registry in [ + ("metric", METRIC_REGISTRY), + ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), + ("aggregation", METRIC_AGGREGATION_REGISTRY), + ]: + if key in args: + value = args[key] + assert ( + value not in registry + ), f"{key} named '{value}' conflicts with existing registered {key}!" + + if key == "metric": + registry[name] = fn + elif key == "aggregation": + registry[name] = AGGREGATION_REGISTRY[value] + else: + registry[name] = value + + return fn + + return decorate + + +def get_metric(name: str, hf_evaluate_metric=False) -> Callable: + if not hf_evaluate_metric: + if name in METRIC_REGISTRY: + return METRIC_REGISTRY[name] + else: + eval_logger.warning( + f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..." + ) + + try: + metric_object = hf_evaluate.load(name) + return metric_object.compute + except Exception: + eval_logger.error( + f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric", + ) + + +def register_aggregation(name: str): + def decorate(fn): + assert ( + name not in AGGREGATION_REGISTRY + ), f"aggregation named '{name}' conflicts with existing registered aggregation!" 
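+        # Illustrative usage of this decorator (a sketch of how aggregations are
+        # typically registered elsewhere in the codebase, e.g. in lm_eval.api.metrics):
+        #
+        #     @register_aggregation("mean")
+        #     def mean(arr):
+        #         return sum(arr) / len(arr)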
+ + AGGREGATION_REGISTRY[name] = fn + return fn + + return decorate + + +def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]: + try: + return AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning(f"{name} not a registered aggregation metric!") + + +def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]: + try: + return METRIC_AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning(f"{name} metric is not assigned a default aggregation!") + + +def is_higher_better(metric_name) -> bool: + try: + return HIGHER_IS_BETTER_REGISTRY[metric_name] + except KeyError: + eval_logger.warning( + f"higher_is_better not specified for metric '{metric_name}'!" + ) + + +def register_filter(name): + def decorate(cls): + if name in FILTER_REGISTRY: + eval_logger.info( + f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}" + ) + FILTER_REGISTRY[name] = cls + return cls + + return decorate + + +def get_filter(filter_name: str) -> type: + try: + return FILTER_REGISTRY[filter_name] + except KeyError: + eval_logger.warning(f"filter `{filter_name}` is not registered!") diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..94e101729c8eb48dc10066a0114f3ba1f60a1307 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/samplers.py @@ -0,0 +1,198 @@ +from functools import partial + +import datasets + + +class ContextSampler: + def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: + self.rnd = rnd + if not self.rnd: + raise ValueError( + "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!" + ) + + self.task = task + self.config = task._config + + self.target_delimiter = self.config.target_delimiter + self.fewshot_delimiter = self.config.fewshot_delimiter + + if ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("doc_to_text", None) is not None + ): + self.doc_to_text = partial( + self.task.doc_to_text, + doc_to_text=self.config.fewshot_config.get("doc_to_text", None), + ) + else: + self.doc_to_text = self.task.doc_to_text + + if ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("doc_to_target", None) is not None + ): + self.doc_to_target = partial( + self.task.doc_to_target, + doc_to_target=self.config.fewshot_config.get("doc_to_target", None), + ) + else: + self.doc_to_target = self.task.doc_to_target + + if ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("doc_to_choice", None) is not None + ): + self.doc_to_choice = partial( + self.task.doc_to_choice, + doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None), + ) + else: + self.doc_to_choice = self.task.doc_to_choice + + self.docs = docs # HF dataset split, provided by task._fewshot_docs() + if fewshot_indices: # subset few-shot docs from + if not isinstance(self.docs, datasets.Dataset): + raise ValueError( + "Got `fewshot_indices` but fewshot_docs are not a HF dataset. 
Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously" + ) + self.docs = self.docs.select(fewshot_indices) + + def get_context(self, doc, num_fewshot): + # draw an extra fewshot sample if using same split as evaluating on + n_samples = ( + num_fewshot + 1 + if self.config.fewshot_split == self.config.test_split + else num_fewshot + ) + + # draw `n_samples` docs from fewshot_docs + fewshotex = self.sample(n_samples) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + # TODO: should we just stop people from using fewshot from same split as evaluating? + selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = "" + for doc in selected_docs: + doc_content = self.doc_to_text(doc) + doc_target = self.doc_to_target(doc) + labeled_examples += ( + doc_content + if self.config.doc_to_choice is None or isinstance(doc_content, str) + else self.doc_to_choice(doc)[doc_content] + ) + labeled_examples += self.target_delimiter + if doc_target != "": + labeled_examples += ( + str(doc_target[0]) + if isinstance(doc_target, list) + else doc_target + if self.config.doc_to_choice is None or isinstance(doc_target, str) + else str(self.doc_to_choice(doc)[doc_target]) + ) + labeled_examples += self.fewshot_delimiter + + return labeled_examples + + def get_chat_context( + self, + doc, + num_fewshot, + fewshot_as_multiturn: bool = False, + ): + chat_history = [] + # draw an extra fewshot sample if using same split as evaluating on + n_samples = ( + num_fewshot + 1 + if self.config.fewshot_split == self.config.test_split + else num_fewshot + ) + # draw `n_samples` docs from fewshot_docs + fewshotex = self.sample(n_samples) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + # TODO: should we just stop people from using fewshot from same split as evaluating? + selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] + + if fewshot_as_multiturn: + for doc in selected_docs: + doc_content = self.doc_to_text(doc) + doc_target = self.doc_to_target(doc) + chat_history.append( + { + "role": "user", + "content": doc_content + if self.config.doc_to_choice is None + or isinstance(doc_content, str) + else self.doc_to_choice(doc)[doc_content], + } + ) + chat_history.append( + { + "role": "assistant", + "content": str(doc_target[0]) + if isinstance(doc_target, list) + else doc_target + if self.config.doc_to_choice is None + or isinstance(doc_target, str) + else str(self.doc_to_choice(doc)[doc_target]), + } + ) + else: + # get fewshot context as one user turn + chat_history.append( + {"role": "user", "content": self.get_context(doc, num_fewshot)} + ) + + return chat_history + + def sample(self, n): + """ + Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. + """ + + return self.rnd.sample(self.docs, n) + + +class FirstNSampler(ContextSampler): + def sample(self, n) -> None: + """ + Draw the first `n` samples in order from the specified split. + Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. + """ + assert ( + n <= len(self.docs) + ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." + return self.docs[:n] + + +class BalancedSampler(ContextSampler): + def sample(self, n) -> None: + """ + TODO: this should return approximately class-balanced samples from our fewshot examples. + TODO: what order should they be in? maybe random? 
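+        One possible approach (an illustrative sketch only; nothing below is
+        implemented here): bucket `self.docs` by their `self.doc_to_target(doc)`
+        label, shuffle each bucket with `self.rnd`, then round-robin across the
+        buckets until `n` docs have been drawn, so every label is represented
+        roughly equally, e.g.
+
+            buckets = collections.defaultdict(list)
+            for d in self.docs:
+                buckets[str(self.doc_to_target(d))].append(d)
+            # then interleave the shuffled buckets and take the first `n` docs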
+ """ + + pass + + +class ManualSampler(ContextSampler): + def sample(self, n) -> None: + """ """ + pass + + +SAMPLER_REGISTRY = { + "default": ContextSampler, + "first_n": FirstNSampler, +} + + +def get_sampler(name): + try: + return SAMPLER_REGISTRY[name] + except KeyError: + raise ValueError( + f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/api/task.py b/scripts/yans/lm-evaluation-harness/lm_eval/api/task.py new file mode 100644 index 0000000000000000000000000000000000000000..8a1a3bdbafac2d1c4c2cc7764a1e988e92183c53 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/api/task.py @@ -0,0 +1,1674 @@ +import abc +import ast +import logging +import random +import re +from collections.abc import Callable +from copy import deepcopy +from dataclasses import asdict, dataclass +from inspect import getsource +from typing import ( + Any, + Dict, + Iterable, + Iterator, + List, + Literal, + Mapping, + Optional, + Tuple, + Union, +) + +import datasets +import numpy as np +from tqdm import tqdm + +from lm_eval import utils +from lm_eval.api import samplers +from lm_eval.api.instance import Instance, OutputType +from lm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity +from lm_eval.api.registry import ( + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, + get_aggregation, + get_metric, + get_metric_aggregation, + is_higher_better, +) +from lm_eval.caching.cache import load_from_cache, save_to_cache +from lm_eval.filters import build_filter_ensemble +from lm_eval.prompts import get_prompt + + +ALL_OUTPUT_TYPES = [ + "loglikelihood", + "multiple_choice", + "loglikelihood_rolling", + "generate_until", +] + +eval_logger = logging.getLogger("lm-eval") + + +@dataclass +class TaskConfig(dict): + # task naming/registry + task: Optional[str] = None + task_alias: Optional[str] = None + tag: Optional[Union[str, list]] = None + group: Optional[Union[str, list]] = None + # HF dataset options. + # which dataset to use, + # and what splits for what purpose + dataset_path: Optional[str] = None + dataset_name: Optional[str] = None + dataset_kwargs: Optional[dict] = None + training_split: Optional[str] = None + validation_split: Optional[str] = None + test_split: Optional[str] = None + fewshot_split: Optional[str] = ( + None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?) + ) + # formatting / prompting options. + # see docs/advanced_task_guide.md for more info + process_docs: Optional[Callable] = None + doc_to_text: Optional[Union[Callable, str]] = None + doc_to_target: Optional[Union[Callable, str]] = None + doc_to_choice: Optional[Union[Callable, str, dict, list]] = None + process_results: Optional[Union[Callable, str]] = None + use_prompt: Optional[str] = None + description: str = "" + target_delimiter: str = " " + fewshot_delimiter: str = "\n\n" + fewshot_config: Optional[dict] = None + # runtime configuration options + num_fewshot: Optional[int] = None + # scoring options + metric_list: Optional[list] = None + output_type: OutputType = "generate_until" + generation_kwargs: Optional[dict] = None + repeats: int = 1 + filter_list: Optional[Union[str, list]] = None + should_decontaminate: bool = False + doc_to_decontamination_query: Optional[str] = None + metadata: Optional[dict] = ( + None # by default, not used in the code. 
allows for users to pass arbitrary info to tasks + ) + + def __post_init__(self) -> None: + if self.group is not None: + eval_logger.warning( + "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information." + ) + + if self.tag is None: + self.tag = self.group + else: + raise ValueError( + "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4." + ) + + if self.generation_kwargs is not None: + if self.output_type != "generate_until": + eval_logger.warning( + f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!" + ) + + if "temperature" in self.generation_kwargs: + self.generation_kwargs["temperature"] = float( + self.generation_kwargs["temperature"] + ) + + if "until" not in self.generation_kwargs: + self.generation_kwargs["until"] = [self.fewshot_delimiter] + else: + if self.output_type == "generate_until": + # ensure that we greedily generate in absence of explicit arguments otherwise + self.generation_kwargs = { + "until": ( + None + if self.fewshot_delimiter is None + else [self.fewshot_delimiter] + ), + "do_sample": False, + } + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + return setattr(self, item, value) + + def to_dict(self, keep_callable: bool = False) -> dict: + """dumps the current config as a dictionary object, as a printable format. + null fields will not be printed. + Used for dumping results alongside full task configuration + + :return: dict + A printable dictionary version of the TaskConfig object. + + # TODO: should any default value in the TaskConfig not be printed? + """ + cfg_dict = asdict(self) + # remove values that are `None` + for k, v in list(cfg_dict.items()): + if v is None: + cfg_dict.pop(k) + elif k == "metric_list": + for metric_dict in v: + for metric_key, metric_value in metric_dict.items(): + if callable(metric_value): + metric_dict[metric_key] = self.serialize_function( + metric_value, keep_callable=keep_callable + ) + cfg_dict[k] = v + elif callable(v): + cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) + return cfg_dict + + def serialize_function( + self, value: Union[Callable, str], keep_callable=False + ) -> Union[Callable, str]: + """Serializes a given function or string. + + If 'keep_callable' is True, the original callable is returned. + Otherwise, attempts to return the source code of the callable using 'getsource'. + """ + if keep_callable: + return value + else: + try: + return getsource(value) + except (TypeError, OSError): + return str(value) + + +class Task(abc.ABC): + """A task represents an entire benchmark including its dataset, problems, + answers, and evaluation methods. See BoolQ for a simple example implementation + + A `doc` can be any python object which represents one instance of evaluation. + This is usually a dictionary e.g. + {"question": ..., "answer": ...} or + {"question": ..., question, answer) + """ + + VERSION: Optional[Union[int, str]] = None + + # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub + # or a path to a custom `datasets` loading script. 
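+    # (Illustrative values only: e.g. a task wrapping BoolQ might set
+    # DATASET_PATH = "super_glue" and DATASET_NAME = "boolq".)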
+ DATASET_PATH: Optional[str] = None + + # The name of a subset within `DATASET_PATH`. + DATASET_NAME: Optional[str] = None + + OUTPUT_TYPE: Optional[OutputType] = None + + def __init__( + self, + data_dir: Optional[str] = None, + cache_dir: Optional[str] = None, + download_mode: Optional[datasets.DownloadMode] = None, + config: Optional[Mapping] = None, # Union[dict, TaskConfig] + ) -> None: + """ + :param data_dir: str + Stores the path to a local folder containing the `Task`'s data files. + Use this to specify the path to manually downloaded data (usually when + the dataset is not publicly accessible). + :param cache_dir: str + The directory to read/write the `Task` dataset. This follows the + HuggingFace `datasets` API with the default cache directory located at: + `~/.cache/huggingface/datasets` + NOTE: You can change the cache location globally for a given process + to another directory: + `export HF_DATASETS_CACHE="/path/to/another/directory"` + :param download_mode: datasets.DownloadMode + How to treat pre-existing `Task` downloads and data. + - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` + Reuse download and reuse dataset. + - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` + Reuse download with fresh dataset. + - `datasets.DownloadMode.FORCE_REDOWNLOAD` + Fresh download and fresh dataset. + """ + self.download(data_dir, cache_dir, download_mode) + self._training_docs: Optional[list] = None + self._fewshot_docs: Optional[list] = None + self._instances: Optional[List[Instance]] = None + + self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig() + + self._filters = [build_filter_ensemble("none", [["take_first", None]])] + self.fewshot_rnd: Optional[random.Random] = ( + None # purposely induce errors in case of improper usage + ) + + def download( + self, + data_dir: Optional[str] = None, + cache_dir: Optional[str] = None, + download_mode=None, + ) -> None: + """Downloads and returns the task dataset. + Override this method to download the dataset from a custom API. + + :param data_dir: str + Stores the path to a local folder containing the `Task`'s data files. + Use this to specify the path to manually downloaded data (usually when + the dataset is not publicly accessible). + :param cache_dir: str + The directory to read/write the `Task` dataset. This follows the + HuggingFace `datasets` API with the default cache directory located at: + `~/.cache/huggingface/datasets` + NOTE: You can change the cache location globally for a given process + by setting the shell environment variable, `HF_DATASETS_CACHE`, + to another directory: + `export HF_DATASETS_CACHE="/path/to/another/directory"` + :param download_mode: datasets.DownloadMode + How to treat pre-existing `Task` downloads and data. + - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` + Reuse download and reuse dataset. + - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` + Reuse download with fresh dataset. + - `datasets.DownloadMode.FORCE_REDOWNLOAD` + Fresh download and fresh dataset. 
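+        Example (an illustrative sketch, not part of the upstream docstring): a
+        subclass reading local JSONL files instead of the Hub could override this as
+
+            def download(self, *args, **kwargs) -> None:
+                self.dataset = datasets.load_dataset(
+                    "json", data_files={"test": "/path/to/test.jsonl"}
+                )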
+ """ + self.dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + data_dir=data_dir, + cache_dir=cache_dir, + download_mode=download_mode, + ) + + @property + def config(self) -> TaskConfig: + """Returns the TaskConfig associated with this class.""" + return self._config + + @abc.abstractmethod + def has_training_docs(self): + """Whether the task has a training set""" + pass + + @abc.abstractmethod + def has_validation_docs(self): + """Whether the task has a validation set""" + pass + + @abc.abstractmethod + def has_test_docs(self): + """Whether the task has a test set""" + pass + + def training_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def validation_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def test_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def fewshot_docs(self) -> Iterable: + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + if self.has_training_docs(): + return self.training_docs() + elif self.has_validation_docs(): + return self.validation_docs() + else: + eval_logger.warning( + f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False" + ", using test_docs as fewshot_docs but this is not recommended." + ) + return self.test_docs() + + def _process_doc(self, doc: dict) -> dict: + """ + Override this to process (detokenize, strip, replace, etc.) individual + documents. This can be used in a map over documents of a data split. + E.g. `map(self._process_doc, self.dataset["validation"])` + + :return: dict + The processed version of the specified `doc`. + """ + return doc + + @property + def instances(self) -> List[Instance]: + """After calling `task.build_all_requests()`, tasks + maintain a list of the dataset instances which will be evaluated. + """ + return self._instances + + def fewshot_examples(self, k, rnd): + if self._training_docs is None: + self._training_docs = list(self.training_docs()) + + return rnd.sample(self._training_docs, k) + + def doc_to_decontamination_query(self, doc): + raise NotImplementedError( + "Override doc_to_decontamination_query with document specific decontamination query." 
+ ) + + @abc.abstractmethod + def doc_to_text(self, doc): + pass + + @abc.abstractmethod + def doc_to_target(self, doc): + pass + + def build_all_requests( + self, + *, + limit: Union[int, None] = None, + rank: int = 0, + world_size: int = 1, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + tokenizer_name: str = "", + ) -> None: + """Build a set of Instances for a task, and store them in task.instances""" + + # used with caching + og_limit = limit + + cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}" + cache_key += "-chat_template" if apply_chat_template else "" + cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else "" + cache_key += ( + f"-system_prompt_hash{utils.hash_string(system_instruction)}" + if system_instruction is not None + else "" + ) + cache_key += f"-tokenizer{tokenizer_name}" + + cached_instances = load_from_cache(file_name=cache_key) + + if cache_requests and cached_instances and not rewrite_requests_cache: + cached_instances = cached_instances[:limit] + + flattened_instances = [ + instance + for instance_group in cached_instances + for instance in instance_group + ] + + self._instances = flattened_instances + return + + eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...") + + instances = [] + + # process all documents when caching is specified for simplicity + if ( + cache_requests + and (not cached_instances or rewrite_requests_cache) + and limit is not None + ): + limit = None + + doc_id_docs = list( + self.doc_iterator(rank=rank, limit=limit, world_size=world_size) + ) + + num_docs = len(doc_id_docs) + + for doc_id, doc in tqdm( + doc_id_docs, + total=num_docs, + ): + # sample fewshot context #TODO: need to offset doc_id by rank now! + fewshot_ctx = self.fewshot_context( + doc, + 0 if self.config.num_fewshot is None else self.config.num_fewshot, + system_instruction, + apply_chat_template, + fewshot_as_multiturn, + chat_template, + ) + + # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute + inst = self.construct_requests( + doc=doc, + ctx=fewshot_ctx, + metadata=(self.config["task"], doc_id, self.config.repeats), + ) + + if not isinstance(inst, list): + inst = [inst] + + instances.append(inst) + + # now flatten, this is to allow slicing to work with pickles + + sliced_instances = instances[:og_limit] + + flattened_instances = [ + instance + for instance_group in sliced_instances + for instance in instance_group + ] + + self._instances = flattened_instances + + if len(self._instances) == 0: + raise ValueError("task.build_requests() did not find any docs!") + + if cache_requests and (not cached_instances or rewrite_requests_cache): + save_to_cache(file_name=cache_key, obj=instances) + + @abc.abstractmethod + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. 
+ :param doc_idx: int + The index of a document within `self.test_docs()` or `self.validation_docs()`, + whichever is the main split used. + :param repeats: int + TODO: update this docstring + The number of times each instance in a dataset is inferred on. Defaults to 1, + can be increased for techniques like majority voting. + """ + pass + + @abc.abstractmethod + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + pass + + @abc.abstractmethod + def aggregation(self): + """ + :returns: {str: [metric_score] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metric scores + """ + pass + + @abc.abstractmethod + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + pass + + def get_config(self, key: str) -> Any: + return getattr(self._config, key, None) + + @classmethod + def count_bytes(cls, doc): + """Used for byte-level perplexity metrics in rolling loglikelihood""" + return len(doc.encode("utf-8")) + + @classmethod + def count_words(cls, doc): + """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!""" + return len(re.split(r"\s+", doc)) + + @utils.positional_deprecated + def fewshot_context( + self, + doc, + num_fewshot, + rnd=None, + description=None, + ): + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc: str + The document as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :param rnd: random.Random + The pseudo-random number generator used to randomly sample examples. + WARNING: This is currently a required arg although it's optionalized with a default `None`. + :param description: str + The task's description that will be prepended to the fewshot examples. + :returns: str + The fewshot context. 
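+
+        Example (illustrative sketch; assumes `task` is a concrete Task subclass
+        instance and `doc` is one of its evaluation documents):
+
+            import random
+            ctx = task.fewshot_context(doc, num_fewshot=2, rnd=random.Random(1234))
+            # `ctx` is the plain-text prompt: description + 2 examples + the query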
+ """ + if rnd is None: + if self.fewshot_rnd is not None: + rnd = self.fewshot_rnd + else: + raise ValueError( + "A `random.Random` generator argument must be provided to `rnd`" + ) + + description = description if description else "" + + if num_fewshot == 0: + labeled_examples = "" + else: + # for sets with no training docs, draw from other set *but ensure no overlap with current doc* + if self.has_training_docs(): + fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd) + else: + if self._fewshot_docs is None: + self._fewshot_docs = list( + self.validation_docs() + if self.has_validation_docs() + else self.test_docs() + ) + + fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + fewshotex = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = ( + "\n\n".join( + [ + self.doc_to_text(doc) + self.doc_to_target(doc) + for doc in fewshotex + ] + ) + + "\n\n" + ) + + example = self.doc_to_text(doc) + return description + labeled_examples + example + + def apply_filters(self) -> Optional[List[Instance]]: + """Iterates over FilterEnsembles and applies them to instances""" + if hasattr(self, "_filters"): + for f in self._filters: + f.apply(self._instances) + else: + eval_logger.warning("No filter defined, passing through instances") + return self._instances + + def dump_config(self) -> dict: + """Returns the config as a dictionary.""" + # TODO: this should only return the overrides applied to a non-YAML task's configuration. + # (num_fewshot) + return self.config.to_dict() + + def set_config(self, key: str, value: Any, update: bool = False) -> None: + """Set or update the configuration for a given key.""" + if key is None: + raise ValueError("Key must be provided.") + + if update: + current_value = getattr(self._config, key, {}) + if not isinstance(current_value, dict): + raise TypeError( + f"Expected a dict for key '{key}', got {type(current_value).__name__} instead." + ) + current_value.update(value) + else: + setattr(self._config, key, value) + + def override_metric(self, metric_name: str) -> None: + """ + Override the default metrics used for evaluation with custom metrics. + + Parameters: + - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics. + """ + ( + self._metric_fn_list, + self._aggregation_list, + self._metric_fn_kwargs, + self._higher_is_better, + ) = ({}, {}, {}, {}) + self._metric_fn_list[metric_name] = get_metric(metric_name) + self._aggregation_list[metric_name] = get_metric_aggregation(metric_name) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + self._metric_fn_kwargs[metric_name] = {} + if not isinstance(self, ConfigurableTask): + self.process_results = lambda x, y: {metric_name: get_metric(metric_name)} + self.aggregation = lambda: { + metric_name: get_metric_aggregation(metric_name) + } + setattr(self._config, "metric_list", [{"metric": metric_name}]) + setattr(self._config, "process_results", None) + + def set_fewshot_seed(self, seed: Optional[int] = None) -> None: + self.fewshot_rnd = random.Random(seed) + if hasattr(self, "sampler"): + self.sampler.rnd = self.fewshot_rnd + + @property + def eval_docs(self) -> Union[datasets.Dataset, List[dict]]: + if self.has_test_docs(): + return self.test_docs() + elif self.has_validation_docs(): + return self.validation_docs() + else: + raise ValueError( + f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" 
+ ) + + def doc_iterator( + self, *, rank: int = 0, limit: Union[int, None] = None, world_size: int = 1 + ) -> Iterator[Tuple[int, Any]]: + limit = int(limit) if limit else None + doc_iterator = utils.create_iterator( + enumerate(self.eval_docs), + rank=int(rank), + limit=limit, + world_size=int(world_size), + ) + return doc_iterator + + +class ConfigurableTask(Task): + VERSION = "Yaml" + OUTPUT_TYPE = None + CONFIG = None + + def __init__( + self, + data_dir=None, + cache_dir=None, + download_mode=None, + config: Optional[dict] = None, + ) -> None: # TODO no super() call here + # Get pre-configured attributes + self._config = self.CONFIG + + # Use new configurations if there was no preconfiguration + if self.config is None: + self._config = TaskConfig(**config) + # Overwrite configs + else: + if config is not None: + self._config.__dict__.update(config) + + if self.config is None: + raise ValueError( + "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg" + ) + + if isinstance(self.config.metadata, dict): + if "version" in self.config.metadata: + self.VERSION = self.config.metadata["version"] + + if self.config.output_type is not None: + if self.config.output_type not in ALL_OUTPUT_TYPES: + raise ValueError( + f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'" + ) + self.OUTPUT_TYPE = self.config.output_type + + if self.config.dataset_path is not None: + self.DATASET_PATH = self.config.dataset_path + + if self.config.dataset_name is not None: + self.DATASET_NAME = self.config.dataset_name + + self._metric_fn_list = {} + self._metric_fn_kwargs = {} + self._aggregation_list = {} + self._higher_is_better = {} + + if self.config.metric_list is None: + # TODO: handle this in TaskConfig.__post_init__ ? + _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] + + for metric_name in _metric_list: + self._metric_fn_list[metric_name] = get_metric(metric_name) + self._metric_fn_kwargs[metric_name] = {} + self._aggregation_list[metric_name] = get_metric_aggregation( + metric_name + ) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + else: + for metric_config in self.config.metric_list: + if "metric" not in metric_config: + raise ValueError( + "'metric' key not provided for an entry in 'metric_list', must be specified!" 
+ ) + metric_name = metric_config["metric"] + kwargs = { + key: metric_config[key] + for key in metric_config + if key + not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] + } + hf_evaluate_metric = ( + "hf_evaluate" in metric_config + and metric_config["hf_evaluate"] is True + ) + + if self.config.process_results is not None: + self._metric_fn_list[metric_name] = None + self._metric_fn_kwargs[metric_name] = {} + elif callable(metric_name): + metric_fn = metric_name.__call__ + metric_name = metric_name.__name__ + self._metric_fn_list[metric_name] = metric_fn + self._metric_fn_kwargs[metric_name] = kwargs + else: + self._metric_fn_list[metric_name] = get_metric( + metric_name, hf_evaluate_metric + ) + self._metric_fn_kwargs[metric_name] = kwargs + + if "aggregation" in metric_config: + agg_name = metric_config["aggregation"] + if isinstance(agg_name, str): + self._aggregation_list[metric_name] = get_aggregation(agg_name) + elif callable(agg_name): # noqa: E721 + self._aggregation_list[metric_name] = metric_config[ + "aggregation" + ] + else: + INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} + metric_agg = get_metric_aggregation(metric_name) + eval_logger.warning( + f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. " + f"using default " + f"aggregation={INV_AGG_REGISTRY[metric_agg]}" + ) + self._aggregation_list[metric_name] = metric_agg + + if "higher_is_better" in metric_config: + self._higher_is_better[metric_name] = metric_config[ + "higher_is_better" + ] + else: + eval_logger.warning( + f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. " + f"using default " + f"higher_is_better={is_higher_better(metric_name)}" + ) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + + self.download(self.config.dataset_kwargs) + self._training_docs = None + self._fewshot_docs = None + + if self.config.filter_list is not None: + self._filters = [] + for filter_config in self.config.filter_list: + filter_name = filter_config["name"] + filter_functions = filter_config["filter"] + components = [] + for function in filter_functions: + kwargs = { + key: function[key] for key in function if key != "function" + } + components.append([function["function"], kwargs]) + filter_pipeline = build_filter_ensemble(filter_name, components) + self._filters.append(filter_pipeline) + else: + self._filters = [build_filter_ensemble("none", [["take_first", None]])] + + if self.config.use_prompt is not None: + eval_logger.info(f"loading prompt {self.config.use_prompt}") + self.prompt = get_prompt( + self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME + ) + else: + self.prompt = None + + if self.fewshot_docs() is not None: + self.fewshot_rnd = ( + random.Random() + ) # setting with no seed, to be overridden at a later time + config_sampler: Union[str, Callable] = ( + self.config.fewshot_config.get("sampler", "default") + if self.config.fewshot_config + else "default" + ) + if isinstance(config_sampler, str): + self.sampler = samplers.get_sampler(config_sampler)( + list(self.fewshot_docs()), self, rnd=self.fewshot_rnd + ) + elif callable(config_sampler) and issubclass( + config_sampler, samplers.ContextSampler + ): + self.sampler = config_sampler( + docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd + ) + else: + raise TypeError( + f"fewshot_config.sampler should be a string or callable of ContextSampler type, " + f"not {type(config_sampler)}" + ) + + self.task_docs = 
self.eval_docs + + # Test One Doc + self.features = list(self.task_docs.features.keys()) + self.multiple_input = 0 + self.multiple_target = 0 + test_doc = self.task_docs[0] + test_text = self.doc_to_text(test_doc) + test_target = self.doc_to_target(test_doc) + + if self.config.doc_to_choice is not None: + test_choice = self.doc_to_choice(test_doc) + if not isinstance(test_choice, list): + eval_logger.error("doc_to_choice must return list") + else: + num_choice = len(test_choice) + + if isinstance(test_text, int): + self.multiple_input = num_choice + else: + test_choice = None + + if isinstance(test_target, list): + self.multiple_target = len(test_target) + else: + if (isinstance(test_target, int)) and (test_choice is not None): + test_target = test_choice[test_target] + else: + test_target = str(test_target) + + if test_choice is not None: + check_choices = test_choice + else: + check_choices = [test_target] + if self.config.doc_to_choice is not None: + for choice in check_choices: + choice_has_whitespace = True if choice[0].isspace() else False + delimiter_has_whitespace = ( + True + if self.config.target_delimiter.rstrip() + != self.config.target_delimiter + else False + ) + + if delimiter_has_whitespace and choice_has_whitespace: + eval_logger.debug( + f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace' + ) + elif (not delimiter_has_whitespace) and (not choice_has_whitespace): + eval_logger.debug( + f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' + ) + + def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None: + self.dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + **dataset_kwargs if dataset_kwargs is not None else {}, + ) + + def has_training_docs(self) -> bool: + if self.config.training_split is not None: + return True + else: + return False + + def has_validation_docs(self) -> bool: + if self.config.validation_split is not None: + return True + else: + return False + + def has_test_docs(self) -> bool: + if self.config.test_split is not None: + return True + else: + return False + + def training_docs(self) -> datasets.Dataset: + if self.has_training_docs(): + if self.config.process_docs is not None: + return self.config.process_docs( + self.dataset[self.config.training_split] + ) + return self.dataset[self.config.training_split] + + def validation_docs(self) -> datasets.Dataset: + if self.has_validation_docs(): + if self.config.process_docs is not None: + return self.config.process_docs( + self.dataset[self.config.validation_split] + ) + return self.dataset[self.config.validation_split] + + def test_docs(self) -> datasets.Dataset: + if self.has_test_docs(): + if self.config.process_docs is not None: + return self.config.process_docs(self.dataset[self.config.test_split]) + return self.dataset[self.config.test_split] + + def fewshot_docs(self): + if self.config.fewshot_split is not None: + if self.config.process_docs is not None: + return self.config.process_docs(self.dataset[self.config.fewshot_split]) + return self.dataset[self.config.fewshot_split] + elif ( + self.config.fewshot_config is not None + and self.config.fewshot_config.get("samples", None) is not None + ): + if isinstance(self.config.fewshot_config["samples"], list): + return self.config.fewshot_config["samples"] + elif callable(self.config.fewshot_config["samples"]): + 
return self.config.fewshot_config["samples"]() + else: + raise Exception( + "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list." + ) + else: + if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): + eval_logger.warning( + f"[Task: {self.config.task}] " + "num_fewshot > 0 but fewshot_split is None. " + "using preconfigured rule." + ) + return super().fewshot_docs() + + @staticmethod + def append_target_question( + labeled_examples: List[Dict[str, str]], + question: str, + fewshot_as_multiturn: bool = False, + ) -> None: + """Adds a target question to the labeled examples list. + If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry. + Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant. + """ + if not fewshot_as_multiturn: + # if no messages or last message is system, append as new user entry + if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system": + labeled_examples.append({"role": "user", "content": question}) + # if last message is user, append to it to avoid two user messages in a row + else: + labeled_examples[-1]["content"] += question + else: + # if fewshot_as_multiturn is True, append as next user entry (last is always assistant) + labeled_examples.append({"role": "user", "content": question}) + + @utils.positional_deprecated + def fewshot_context( + self, + doc: str, + num_fewshot: int, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + ) -> str: + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc: str + The document as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :param system_instruction: str + System instruction to be applied to the prompt. + :param apply_chat_template: bool + Whether to apply the chat template to the fewshot context. + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :param chat_template: Callable + Chat template to be applied to the fewshot context. + :returns: str + The fewshot context. 
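+
+        Example (illustrative sketch; assumes `task` is a ConfigurableTask whose
+        fewshot sampler was seeded via `task.set_fewshot_seed(1234)` and `doc` is
+        one of its evaluation documents):
+
+            ctx = task.fewshot_context(doc, num_fewshot=2)
+            # with apply_chat_template=True the same call instead builds a list of
+            # chat messages and renders it via the supplied `chat_template` callable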
+ """ + + if apply_chat_template: + labeled_examples = [] + else: + labeled_examples = "" + + # get task description + if description := self.config.description: + description = utils.apply_template(self.config.description, doc) + + # create system prompt based on the provided system instruction and description + if system_instruction is not None and description: + system_prompt = ( + f"{system_instruction}{self.sampler.fewshot_delimiter}{description}" + ) + elif system_instruction is not None: + system_prompt = system_instruction + elif description: + system_prompt = description + else: + system_prompt = "" + + # add system prompt if specified + if system_prompt: + if apply_chat_template: + labeled_examples.append({"role": "system", "content": system_prompt}) + else: + labeled_examples = system_prompt + + # if few-shot - append examples after the system prompt + if num_fewshot > 0: + if apply_chat_template: + labeled_examples.extend( + self.sampler.get_chat_context( + doc, num_fewshot, fewshot_as_multiturn + ) + ) + else: + labeled_examples += self.sampler.get_context(doc, num_fewshot) + + example = self.doc_to_text(doc) + if apply_chat_template: + if self.multiple_input: + return chat_template(labeled_examples) + if isinstance(example, str): + self.append_target_question( + labeled_examples, example, fewshot_as_multiturn + ) + # for loglikelihood create a list of questions with appended choices + elif isinstance(example, list): + labeled_examples_list = [] + # copy chat history for each example and append the answer + for ex in example: + chat = deepcopy(labeled_examples) + self.append_target_question(chat, ex, fewshot_as_multiturn) + labeled_examples_list.append(chat_template(chat)) + return labeled_examples_list + # if example is an integer, append the choice or convert to string + elif isinstance(example, int): + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + self.append_target_question( + labeled_examples, choices[example], fewshot_as_multiturn + ) + else: + self.append_target_question( + labeled_examples, str(example), fewshot_as_multiturn + ) + # return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) + else: + if self.multiple_input: + return labeled_examples + if isinstance(example, str): + return labeled_examples + example + elif isinstance(example, list): + return [labeled_examples + ex for ex in example] + elif isinstance(example, int): + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + return labeled_examples + choices[example] + else: + return labeled_examples + str(example) + + def apply_filters(self): + """Iterates over FilterEnsembles and applies them to instances""" + if hasattr(self, "_filters"): + for f in self._filters: + f.apply(self._instances) + else: + eval_logger.warning("No filter defined, passing through instances") + return self._instances + + def should_decontaminate(self): + return self.config.should_decontaminate + + def doc_to_decontamination_query(self, doc): + if self.config.should_decontaminate: + if self.config.doc_to_decontamination_query is None: + return self.doc_to_text(doc) + else: + doc_to_decontamination_query = self.config.doc_to_decontamination_query + if doc_to_decontamination_query in self.features: + return doc[doc_to_decontamination_query] + elif callable(doc_to_decontamination_query): + return doc_to_decontamination_query(doc) + else: + return ast.literal_eval( + utils.apply_template( + self.config.doc_to_decontamination_query, doc + ) + ) + + 
def _process_doc(self, doc: dict) -> dict: + """ + Override this to process (detokenize, strip, replace, etc.) individual + documents. This can be used in a map over documents of a data split. + E.g. `map(self._process_doc, self.dataset["validation"])` + + :return: dict + The processed version of the specified `doc`. + """ + return doc + + def doc_to_text(self, doc, doc_to_text=None): + if self.prompt is not None: + doc_to_text = self.prompt + elif doc_to_text is not None: + doc_to_text = doc_to_text + else: + doc_to_text = self.config.doc_to_text + + if isinstance(doc_to_text, int): + return doc_to_text + elif isinstance(doc_to_text, str): + if doc_to_text in self.features: + # if self.config.doc_to_choice is not None: + # return self.doc_to_choice(doc)[doc[doc_to_text]] + # else: + return doc[doc_to_text] + else: + text_string = utils.apply_template(doc_to_text, doc) + if text_string.isdigit() and self._config.doc_to_choice is not None: + return ast.literal_eval(text_string) + else: + return text_string + elif callable(doc_to_text): + return doc_to_text(doc) + # Used when applying a Promptsource template + elif hasattr(doc_to_text, "apply"): + applied_prompt = doc_to_text.apply(doc) + if len(applied_prompt) == 2: + return applied_prompt[0] + else: + eval_logger.warning("Applied prompt returns empty string") + return self.config.fewshot_delimiter + else: + print(type(doc_to_text)) + raise TypeError + + def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]: + if self.prompt is not None: + doc_to_target = self.prompt + elif doc_to_target is not None: + doc_to_target = doc_to_target + else: + doc_to_target = self.config.doc_to_target + + if isinstance(doc_to_target, int): + return doc_to_target + elif isinstance(doc_to_target, str): + if doc_to_target in self.features: + # if self.config.doc_to_choice is not None: + # return self.doc_to_choice(doc)[doc[doc_to_target]] + # else: + return doc[doc_to_target] + else: + target_string = utils.apply_template(doc_to_target, doc) + if target_string.isdigit() and self._config.doc_to_choice is not None: + return ast.literal_eval(target_string) + elif ( + len(target_string) >= 2 + and (target_string[0] == "[") + and (target_string[-1] == "]") + ): + try: + return ast.literal_eval(target_string) + except (SyntaxError, ValueError): + return target_string + else: + return target_string + elif isinstance(doc_to_target, list): + return doc_to_target + elif callable(doc_to_target): + return doc_to_target(doc) + # Used when applying a Promptsource template + elif hasattr(doc_to_target, "apply"): + applied_prompt = doc_to_target.apply(doc) + if len(applied_prompt) == 2: + return applied_prompt[1] + else: + eval_logger.warning("Applied prompt returns empty string") + return self.config.fewshot_delimiter + else: + raise TypeError + + def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]: + if self.prompt is not None: + doc_to_choice = self.prompt + elif doc_to_choice is not None: + doc_to_choice = doc_to_choice + elif self.config.doc_to_choice is None: + eval_logger.error("doc_to_choice was called but not set in config") + else: + doc_to_choice = self.config.doc_to_choice + + if isinstance(doc_to_choice, str): + if doc_to_choice in self.features: + return doc[doc_to_choice] + else: + return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) + elif isinstance(doc_to_choice, list): + return doc_to_choice + elif isinstance(doc_to_choice, dict): + return list(doc_to_choice.values()) + elif callable(doc_to_choice): + 
return doc_to_choice(doc) + elif hasattr(doc_to_choice, "get_answer_choices_list"): + return doc_to_choice.get_answer_choices_list(doc) + else: + raise TypeError + + def construct_requests( + self, doc: dict, ctx: str, **kwargs + ) -> Union[List[Instance], Instance]: + if self.OUTPUT_TYPE == "loglikelihood": + arguments = (ctx, self.doc_to_target(doc)) + elif self.OUTPUT_TYPE == "loglikelihood_rolling": + arguments = (self.doc_to_target(doc),) + elif self.OUTPUT_TYPE == "multiple_choice": + choices = self.doc_to_choice(doc) + target_delimiter = self.config.target_delimiter + if self.multiple_input: + # If there are multiple inputs, choices are placed in the ctx + cont = self.doc_to_target(doc) + arguments = [ + (ctx + choice, f"{target_delimiter}{cont}") for choice in choices + ] + else: + # Otherwise they are placed in the continuation + arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] + + request_list = [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=arg, + idx=i, + **kwargs, + ) + for i, arg in enumerate(arguments) + ] + # TODO: we should raise a warning telling users this will at most ~2x runtime. + if "acc_mutual_info" in self._metric_fn_list.keys(): + # if we are calculating multiple choice accuracy + # using mutual information instead of raw loglikelihood as metric, need unconditional lls. + + # here mutual info refers to calculating + # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) + # in other words normalizing by subtracting the unconditional logprob of each choice. + request_list.extend( + [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=("", "{}".format(choice)), + idx=i, + **kwargs, + ) + for i, choice in enumerate(choices) + ] + ) + return request_list + + elif self.OUTPUT_TYPE == "generate_until": + arguments = (ctx, deepcopy(self.config.generation_kwargs)) + + return Instance( + request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs + ) + + def process_results(self, doc, results): + if callable(self.config.process_results): + return self.config.process_results(doc, results) + + result_dict = {} + use_metric = list(self._metric_fn_list.keys()) + if self.OUTPUT_TYPE == "loglikelihood": + results = results[0] + ll, is_greedy = results + return { + **({"perplexity": ll} if "perplexity" in use_metric else {}), + **({"acc": int(is_greedy)} if "acc" in use_metric else {}), + } + elif self.OUTPUT_TYPE == "loglikelihood_rolling": + (loglikelihood,) = results + _words = self.count_words(self.doc_to_target(doc)) + _bytes = self.count_bytes(self.doc_to_target(doc)) + return { + **( + {"word_perplexity": (loglikelihood, _words)} + if "word_perplexity" in use_metric + else {} + ), + **( + {"byte_perplexity": (loglikelihood, _bytes)} + if "byte_perplexity" in use_metric + else {} + ), + **( + {"bits_per_byte": (loglikelihood, _bytes)} + if "bits_per_byte" in use_metric + else {} + ), + } + elif self.OUTPUT_TYPE == "multiple_choice": + lls, is_greedy = zip(*results) + + # retrieve choices in List[str] form, to compute choice lengths, etc. + choices = self.doc_to_choice(doc) + completion_len = np.array([float(len(i)) for i in choices]) + + if ( + 2 * len(choices) == len(lls) + and "acc_mutual_info" in self._metric_fn_list.keys() + ): + # then we are doing mutual info. 
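+                # (The stride-2 slicing below assumes the conditional and
+                # unconditional loglikelihoods for each choice arrive interleaved.)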
+ # this stores the "dryrun" / unconditional answer loglikelihoods + lls_unconditional = lls[1::2] + if len(lls_unconditional) != len(choices): + raise ValueError + # and this stores our "regular" conditional loglikelihoods + lls = lls[::2] + + pred = np.argmax(lls) + pred_norm = np.argmax(lls / completion_len) + + if self.multiple_input: + gold = self.doc_to_text(doc) + else: + gold = self.doc_to_target(doc) + + gold_index_error = False + if isinstance(gold, list): + gold = [i if i < len(choices) else -100 for i in gold] + if -100 in gold: + gold_index_error = True + else: + if isinstance(gold, int): + gold = gold if gold < len(choices) else -100 + elif isinstance(gold, str): + gold = choices.index(gold) if gold in choices else -100 + + if gold == -100: + gold_index_error = True + + if gold_index_error: + eval_logger.warning( + f"Label index was not in within range of available choices," + f"Sample:\n\n{doc}\n\n" + ) + + if self.multiple_target: + acc = 1.0 if pred in gold else 0.0 + acc_norm = 1.0 if pred_norm in gold else 0.0 + exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold])) + else: + acc = 1.0 if pred == gold else 0.0 + acc_norm = 1.0 if pred_norm == gold else 0.0 + # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly + exact_match = int(is_greedy[gold]) if gold != -100 else 0 + + prob_norm = utils.softmax(lls) + + # TODO use keyword arguments to the metric? + # gold, pred, norm stuff, the original lls, + result_dict = { + **({"acc": acc} if "acc" in use_metric else {}), + **({"f1": (gold, pred)} if "f1" in use_metric else {}), + **({"mcc": (gold, pred)} if "mcc" in use_metric else {}), + **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}), + **({"exact_match": exact_match} if "exact_match" in use_metric else {}), + **( + {"brier_score": (gold, prob_norm)} + if "brier_score" in use_metric + else {} + ), + } + + if "acc_mutual_info" in use_metric: + lls_mutual_info = [ + ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional) + ] + acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0 + result_dict["acc_mutual_info"] = acc_mutual_info + + elif self.OUTPUT_TYPE == "generate_until": + gold = self.doc_to_target(doc) + result = results[0] + if self.config.doc_to_choice is not None: + # If you set doc_to_choice, + # it assumes that doc_to_target returns a number. + choices = self.doc_to_choice(doc) + gold = choices[gold] + # we expect multiple_targets to be a list. 
+ elif self.multiple_target: + gold = list(gold) + elif type(gold) != type(result): + # cast gold to the same type as result + gold = type(result)(gold) + + for metric in self._metric_fn_list.keys(): + if self.multiple_target: + # in the case where we have multiple targets, + # return true if any are true + # TODO: this may break for multipLe_target, non zero-or-1 metrics + scores = [] + if not isinstance(gold, list): + # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer + # print(gold) + gold = [gold] + if metric == "exact_match": + result = [result for _ in range(len(gold))] + scores = self._metric_fn_list[metric]( + references=gold, + predictions=result, + **self._metric_fn_kwargs[metric], + )[metric] + result_score = 1.0 if scores > 0.0 else 0.0 + else: + for gold_option in gold: + try: + result_score = self._metric_fn_list[metric]( + references=[gold_option], + predictions=[result], + **self._metric_fn_kwargs[metric], + ) + except ( + TypeError + ): # TODO: this is hacky and I don't want to do it + result_score = self._metric_fn_list[metric]( + [gold_option, result] + ) + if isinstance(result_score, dict): + # TODO: this handles the case where HF evaluate returns a dict. + result_score = result_score[metric] + scores.append(result_score) + if any(scores): + result_score = 1.0 + else: + result_score = 0.0 + else: + try: + result_score = self._metric_fn_list[metric]( + references=[gold], + predictions=[result], + **self._metric_fn_kwargs[metric], + ) + except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + result_score = self._metric_fn_list[metric]([gold, result]) + if isinstance(result_score, dict): + # TODO: this handles the case where HF evaluate returns a dict. + result_score = result_score[metric] + result_dict[metric] = result_score + else: + raise ValueError( + f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ", + "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'", + ) + + return result_dict + + def aggregation(self) -> dict: + return self._aggregation_list + + def higher_is_better(self) -> dict: + return self._higher_is_better + + def get_config(self, key: str) -> Any: + return getattr(self._config, key, None) + + @property + def task_name(self) -> Any: + return getattr(self.config, "task", None) + + def __repr__(self): + return ( + f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," + f"output_type={self.OUTPUT_TYPE}," + f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," + f"num_samples={len(self.eval_docs)})" + ) + + +class MultipleChoiceTask(Task): + OUTPUT_TYPE = "loglikelihood" + + def doc_to_target(self, doc: dict) -> str: + return " " + doc["choices"][doc["gold"]] + + def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]: + # TODO: add mutual info here? + return [ + Instance( + request_type="loglikelihood", + doc=doc, + arguments=(ctx, " {}".format(choice)), + idx=i, + **kwargs, + ) + for i, choice in enumerate(doc["choices"]) + ] + + def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict: + results = [ + res[0] for res in results + ] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere? 
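+        # acc_norm below length-normalizes each loglikelihood by the character
+        # length of its choice, so longer answer strings are not penalized simply
+        # for containing more characters.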
+ gold = doc["gold"] + + acc = 1.0 if np.argmax(results) == gold else 0.0 + completion_len = np.array([float(len(i)) for i in doc["choices"]]) + acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0 + + return { + "acc": acc, + "acc_norm": acc_norm, + } + + def higher_is_better(self) -> dict: + return { + "acc": True, + "acc_norm": True, + } + + def aggregation(self) -> dict: + return { + "acc": mean, + "acc_norm": mean, + } + + +class PerplexityTask(Task): + OUTPUT_TYPE = "loglikelihood_rolling" + + def has_training_docs(self) -> bool: + return False + + def fewshot_examples(self, k: int, rnd) -> List: + if k != 0: + raise ValueError( + "The number of fewshot examples must be 0 for perplexity tasks." + ) + return [] + + def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]: + if num_fewshot != 0: + raise ValueError( + "The number of fewshot examples must be 0 for perplexity tasks." + ) + + return "" + + def higher_is_better(self) -> dict: + return { + "word_perplexity": False, + "byte_perplexity": False, + "bits_per_byte": False, + } + + def doc_to_decontamination_query(self, doc): + return doc + + def doc_to_text(self, doc) -> str: + return "" + + def doc_to_target(self, doc): + return doc + + def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs): + if bool(ctx): + raise ValueError + + return Instance( + request_type=self.OUTPUT_TYPE, + doc=doc, + arguments=(self.doc_to_target(doc),), + idx=0, + **kwargs, + ) + + def process_results(self, doc: dict, results: Tuple[float]) -> dict: + (loglikelihood,) = results + words = self.count_words(self.doc_to_target(doc)) + bytes_ = self.count_bytes(self.doc_to_target(doc)) + return { + "word_perplexity": (loglikelihood, words), + "byte_perplexity": (loglikelihood, bytes_), + "bits_per_byte": (loglikelihood, bytes_), + } + + def aggregation(self) -> dict: + return { + "word_perplexity": weighted_perplexity, + "byte_perplexity": weighted_perplexity, + "bits_per_byte": bits_per_byte, + } + + @classmethod + def count_bytes(cls, doc) -> int: + return len(doc.encode("utf-8")) + + @classmethod + def count_words(cls, doc) -> int: + """Downstream tasks with custom word boundaries should override this!""" + return len(re.split(r"\s+", doc)) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b2441eb878de8d3b58af798ba9f19cda6f82d19 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/__init__.py @@ -0,0 +1,28 @@ +from . 
import ( + anthropic_llms, + api_models, + dummy, + gguf, + huggingface, + mamba_lm, + nemo_lm, + neuralmagic, + neuron_optimum, + openai_completions, + optimum_lm, + textsynth, + vllm_causallms, +) + + +# TODO: implement __all__ + + +try: + # enable hf hub transfer if available + import hf_transfer # type: ignore # noqa + import huggingface_hub.constants # type: ignore + + huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True +except ImportError: + pass diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py new file mode 100644 index
0000000000000000000000000000000000000000..7b22b6a979ca12f6a68af7a16e3c50a8ad233ddf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/anthropic_llms.py @@ -0,0 +1,362 @@ +import os +from functools import cached_property +from typing import Any, Dict, List, Tuple, Union + +from tqdm import tqdm + +from lm_eval import utils +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.openai_completions import LocalCompletionsAPI +from lm_eval.models.utils import retry_on_specific_exceptions + + +eval_logger = utils.eval_logger + + +def anthropic_completion( + client, #: anthropic.Anthropic, + model: str, + prompt: str, + max_tokens_to_sample: int, + temperature: float, + stop: List[str], + **kwargs: Any, +) -> str: + """Wrapper function around the Anthropic completion API client with exponential back-off + in case of RateLimitError. + + params: + client: anthropic.Anthropic + Anthropic API client + model: str + Anthropic model e.g. 'claude-instant-v1', 'claude-2' + prompt: str + Prompt to feed to the model + max_tokens_to_sample: int + Maximum number of tokens to sample from the model + temperature: float + Sampling temperature + stop: List[str] + List of stop sequences + kwargs: Any + Additional model_args to pass to the API client + """ + + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + def _exception_callback(e: Exception, sleep_time: float) -> None: + eval_logger.warning( + f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" + ) + + @retry_on_specific_exceptions( + on_exceptions=[anthropic.RateLimitError], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + response = client.completions.create( + prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}", + model=model, + # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences + # (e.g. gsm8k's ":") may truncate a lot of the input. + stop_sequences=[anthropic.HUMAN_PROMPT] + stop, + max_tokens_to_sample=max_tokens_to_sample, + temperature=temperature, + **kwargs, + ) + return response.completion + + return completion() + + +def anthropic_chat( + client, #: anthropic.Anthropic, + model: str, + prompt: str, + max_tokens: int, + temperature: float, + stop: List[str], + **kwargs: Any, +) -> str: + """Wrapper function around the Anthropic completion API client with exponential back-off + in case of RateLimitError. + + params: + client: anthropic.Anthropic + Anthropic API client + model: str + Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229' + prompt: str + Prompt to feed to the model + max_tokens: int + Maximum number of tokens to sample from the model + temperature: float + Sampling temperature + stop: List[str] + List of stop sequences + kwargs: Any + Additional model_args to pass to the API client + """ + + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. 
\ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + def _exception_callback(e: Exception, sleep_time: float) -> None: + eval_logger.warning( + f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" + ) + + @retry_on_specific_exceptions( + on_exceptions=[ + anthropic.RateLimitError, + anthropic.APIConnectionError, + anthropic.APIStatusError, + ], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def messages(): + response = client.messages.create( + model=model, + max_tokens=max_tokens, + temperature=temperature, + messages=[{"role": "user", "content": f"{prompt}"}], + **kwargs, + ) + return response.content[0].text + + return messages() + + +@register_model("anthropic-completions") +class AnthropicLM(LM): + REQ_CHUNK_SIZE = 20 # TODO: not used + + def __init__( + self, + batch_size: int = 1, + model: str = "claude-2.0", + max_tokens_to_sample: int = 256, + temperature: float = 0, # defaults to 1 + **kwargs, # top_p, top_k, etc. + ) -> None: + """Anthropic API wrapper. + + :param model: str + Anthropic model e.g. 'claude-instant-v1', 'claude-2' + :param max_tokens_to_sample: int + Maximum number of tokens to sample from the model + :param temperature: float + Sampling temperature + :param kwargs: Any + Additional model_args to pass to the API client + """ + super().__init__() + + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + self.model = model + # defaults to os.environ.get("ANTHROPIC_API_KEY") + self.client = anthropic.Anthropic() + self.temperature = temperature + self.max_tokens_to_sample = max_tokens_to_sample + self.tokenizer = self.client.get_tokenizer() + self.kwargs = kwargs + + @property + def eot_token_id(self): + # Not sure but anthropic.HUMAN_PROMPT ? + raise NotImplementedError("No idea about anthropic tokenization.") + + @property + def max_length(self) -> int: + return 2048 + + @property + def max_gen_toks(self) -> int: + return self.max_tokens_to_sample + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError("No support for logits.") + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string).ids + + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. 
\ +please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", + ) + + if not requests: + return [] + + _requests: List[Tuple[str, dict]] = [req.args for req in requests] + + res = [] + for request in tqdm(_requests, disable=disable_tqdm): + try: + inp = request[0] + request_args = request[1] + # generation_kwargs + until = request_args.get("until") + max_gen_toks = request_args.get("max_gen_toks", self.max_length) + temperature = request_args.get("temperature", self.temperature) + response = anthropic_completion( + client=self.client, + model=self.model, + prompt=inp, + max_tokens_to_sample=max_gen_toks, + temperature=temperature, # TODO: implement non-greedy sampling for Anthropic + stop=until, # type: ignore + **self.kwargs, + ) + res.append(response) + + self.cache_hook.add_partial("generate_until", request, response) + except anthropic.APIConnectionError as e: # type: ignore # noqa: F821 + eval_logger.critical(f"Server unreachable: {e.__cause__}") + break + except anthropic.APIStatusError as e: # type: ignore # noqa: F821 + eval_logger.critical(f"API error {e.status_code}: {e.message}") + break + + return res + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + raise NotImplementedError("No support for logits.") + + +@register_model("anthropic-chat", "anthropic-chat-completions") +class AnthropicChat(LocalCompletionsAPI): + def __init__( + self, + base_url="https://api.anthropic.com/v1/messages", + tokenizer_backend=None, + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + eval_logger.warning( + "Chat completions does not support batching. Defaulting to batch size 1." + ) + self._batch_size = 1 + self.anthropic_version = "2023-06-01" + eval_logger.warning( + f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning" + ) + + @cached_property + def api_key(self): + """Override this property to return the API key for the API request.""" + key = os.environ.get("ANTHROPIC_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the ANTHROPIC_API_KEY environment variable." 
+ ) + return key + + @cached_property + def header(self): + return { + "x-api-key": f"{self.api_key}", + "anthropic-version": self.anthropic_version, + } + + def _create_payload( + self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs + ) -> dict: + system = ( + messages[0].get("content") if messages[0].get("role") == "system" else None + ) + if system: + messages = messages[1:] + gen_kwargs.pop("do_sample", False) + max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = gen_kwargs.pop("until", ["\n\nHuman:"]) + if not isinstance(stop, list): + stop = [stop] + out = { + "messages": messages, + "model": self.model, + "max_tokens": max_tokens, + "temperature": temperature, + "stop_sequences": stop, + **gen_kwargs, + } + if system: + out["system"] = system + return out + + def parse_generations( + self, outputs: Union[Dict, List[Dict]], **kwargs + ) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choices in out["content"]: + res.append(choices["text"]) + return res + + def tok_encode( + self, + string: str, + left_truncate_len=None, + add_special_tokens=None, + **kwargs, + ) -> List[str]: + return [string] + + def loglikelihood(self, requests, **kwargs): + raise NotImplementedError( + "Anthropic Chat Completions API does not support the return of loglikelihood" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py new file mode 100644 index 0000000000000000000000000000000000000000..ff72925cad9e53d090d42b3c0794edb784e1d614 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/api_models.py @@ -0,0 +1,641 @@ +import abc +import asyncio +import copy +import itertools +import json +from functools import cached_property +from typing import ( + Any, + Awaitable, + Callable, + Dict, + Iterable, + List, + Literal, + NamedTuple, + Optional, + Tuple, + Union, +) + + +try: + import requests + from aiohttp import ClientSession, TCPConnector + from tenacity import RetryError, retry, stop_after_attempt, wait_exponential + from tqdm import tqdm + from tqdm.asyncio import tqdm_asyncio +except ModuleNotFoundError: + pass + + +from importlib.util import find_spec + +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import TemplateLM +from lm_eval.models.utils import Collator, chunks, configure_pad_token + + +LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]] + + +# utility class to keep track of json encoded chats +class JsonChatStr(NamedTuple): + prompt: str + + def encode(self, encoding): + return self.prompt.encode(encoding) + + +eval_logger = utils.eval_logger + + +class TemplateAPI(TemplateLM): + def __init__( + self, + model: str = None, + pretrained: str = None, # `model` takes precedence over `pretrained` when passed. + base_url: str = None, + tokenizer: Optional[str] = None, + # Logliklehood tasks require a tokenizer to calculate context lengths, + # however the requests can be sent as a string if the API doesn't support token inputs. + # use tokenized_requests=False + tokenizer_backend: Optional[ + Literal["tiktoken", "huggingface", None] + ] = "huggingface", + truncate: bool = False, + # number of concurrent requests. 
More useful if not batching + num_concurrent: int = 1, + max_retries: int = 3, + max_gen_toks: int = 256, + batch_size: Union[str, int] = 1, + seed: int = 1234, + max_length: Optional[int] = 2048, + add_bos_token: bool = False, + custom_prefix_token_id=None, + # send the requests as tokens or strings + tokenized_requests=True, + **kwargs, + ) -> None: + super().__init__() + missing_packages = [ + pkg + for pkg in ["aiohttp", "tqdm", "tenacity", "requests"] + if find_spec(pkg) is None + ] + if missing_packages: + raise ModuleNotFoundError( + f"Attempted to use an API model, but the required packages {missing_packages} are not installed. " + 'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`' + ) + self.model = model or pretrained + self.base_url = base_url + self.tokenizer = tokenizer + if not isinstance(batch_size, int) and "auto" in batch_size: + eval_logger.warning( + "Automatic batch size is not supported for API models. Defaulting to batch size 1." + ) + elif int(batch_size) > 1: + eval_logger.warning( + "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths." + ) + self._batch_size = int(batch_size) if batch_size != "auto" else 1 + self._truncate = truncate + self._max_gen_toks = int(max_gen_toks) + self._seed = int(seed) + self.max_length = max_length + if int(num_concurrent) <= 1: + eval_logger.info( + "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1." + ) + self._concurrent = int(num_concurrent) + self.tokenizer_backend = tokenizer_backend + self.add_bos_token = add_bos_token + self.custom_prefix_token_id = custom_prefix_token_id + self.tokenized_requests = tokenized_requests + self.max_retries = int(max_retries) + + eval_logger.info(f"Using tokenizer {self.tokenizer_backend}") + if self.tokenizer_backend is None: + self.tokenizer = None + self.tokenized_requests = False + else: + if self.tokenizer is None: + if self.tokenizer_backend == "huggingface": + import transformers + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + self.tokenizer if self.tokenizer else self.model + ) + # Not used as the API will handle padding but to mirror the behavior of the HFLM + self.tokenizer = configure_pad_token(self.tokenizer) + elif self.tokenizer_backend == "tiktoken": + try: + import tiktoken + + self.tokenizer = tiktoken.encoding_for_model(self.model) + except ModuleNotFoundError as e: + raise Exception( + "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. " + "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`." + ) from e + if "openai" not in self.base_url: + eval_logger.warning( + f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. " + "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken." 
+ ) + else: + import transformers + + assert isinstance(tokenizer, str), "tokenizer must be a string" + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + ) + + @abc.abstractmethod + def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + *, + generate: bool = True, + gen_kwargs: Optional[dict] = None, + seed: int = 1234, + **kwargs, + ) -> dict: + """This method is responsible for creating the json payload that will be sent to the API.""" + raise NotImplementedError + + def create_message( + self, + messages: Union[List[List[int]], List[str], List[JsonChatStr]], + generate=False, + ) -> Union[List[List[int]], List[dict], List[str], str]: + """Helper method to transform the prompt into the expected API input format. messages consist of batched requests""" + if isinstance(messages[0], JsonChatStr): + # for chat completions we need to decode the json string to list[dict,...] + assert ( + self._batch_size == 1 + ), "non-tokenized chat requests are only supported with batch_size=1" + # list[dict["role":..., "content":...],...] + return json.loads(messages[0].prompt) + + if not self.tokenized_requests: + # if messages are tokenized: + if isinstance(messages[0][0], int): + # assuming decoding is lossless. However, this is only for logliklehood requests + # as we need to compute the context length. For generations, we don't need to tokenize. + messages = self.decode_batch(messages) + if self._batch_size <= 1: + # if batch is 1 return str + return messages[0] + else: + # list[str,...] + return messages + + # list[list[int], ...] + return messages + + @staticmethod + @abc.abstractmethod + def parse_logprobs( + outputs: Union[Any, List[Any]], + tokens: List[List[int]] = None, + ctxlen: List[int] = None, + **kwargs, + ) -> List[Tuple[float, bool]]: + """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples""" + raise NotImplementedError + + @staticmethod + @abc.abstractmethod + def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]: + """Method used to parse the generations from the (batched) API response. This method should return a list of str""" + raise NotImplementedError + + @cached_property + def api_key(self) -> str: + """Override this property to return the API key for the API request.""" + return "" + + @cached_property + def header(self) -> dict: + """Override this property to return the headers for the API request.""" + return {"Authorization": f"Bearer {self.api_key}"} + + @property + def chat_template(self) -> str: + """Must be defined for LM subclasses that implement Chat Templating. + Should return the structure of the chat template applied to user/assistant messages. + Only used for logging and reproducibility. + """ + return "" + + @property + def tokenizer_name(self) -> str: + """Must be defined for LM subclasses which implement Chat Templating. + Should return the name of the tokenizer or chat template used. + Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used. + """ + return "" + + def apply_chat_template( + self, chat_history: List[Dict[str, str]] + ) -> Union[str, JsonChatStr]: + """Applies a chat template to a list of chat history between user and model.""" + if self.tokenizer_backend == "huggingface" and self.tokenized_requests: + return self.tokenizer.apply_chat_template( + chat_history, tokenize=False, add_generation_prompt=True + ) + else: + # bit of a hack. 
We'll load back before sending to the API + return JsonChatStr(json.dumps(chat_history)) + + @cached_property + def eot_token_id(self) -> Optional[int]: + if self.tokenizer is None: + return None + else: + if self.tokenizer_backend == "huggingface": + return self.tokenizer.eos_token_id + elif self.tokenizer_backend == "tiktoken": + return self.tokenizer.eot_token + + @cached_property + def prefix_token_id(self) -> Optional[int]: + if self.tokenizer is None: + return None + else: + if self.custom_prefix_token_id is not None: + return self.custom_prefix_token_id + if self.tokenizer_backend == "huggingface": + if self.tokenizer.bos_token_id is not None: + return self.tokenizer.bos_token_id + return self.tokenizer.eos_token_id + else: + return self.tokenizer.eot_token + + def tok_encode( + self, + string: str, + left_truncate_len: int = None, + add_special_tokens: bool = False, + truncation: bool = False, + **kwargs, + ) -> Union[List[List[int]], List[int], List[str]]: + if self.tokenizer_backend is None: + return [string] + elif self.tokenizer_backend == "huggingface": + # by default for CausalLM - false or self.add_bos_token is set + if not add_special_tokens: + add_special_tokens = False or self.add_bos_token + encoding: Union[List[List[int]], List[int]] = self.tokenizer( + string, + add_special_tokens=add_special_tokens, + truncation=truncation, + return_attention_mask=False, + ).input_ids + + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + if not isinstance(string, str): + encoding = [enc[-left_truncate_len:] for enc in encoding] + else: + encoding = encoding[-left_truncate_len:] + + return encoding + + else: + try: + encoding = self.tokenizer.encode(string) + except Exception: + encoding = self.tokenizer.encode_batch(string) + return encoding + + def decode_batch(self, tokens: List[List[int]]) -> List[str]: + if self.tokenizer_backend == "huggingface": + return self.tokenizer.batch_decode(tokens) + elif self.tokenizer_backend == "tiktoken": + return self.tokenizer.decode_batch(tokens) + + def model_call( + self, + messages: Union[List[List[int]], List[str], List[JsonChatStr]], + *, + generate: bool = True, + gen_kwargs: Optional[Dict] = None, + **kwargs, + ) -> Optional[dict]: + # !!! Copy: shared dict for each request, need new object !!! + gen_kwargs = copy.deepcopy(gen_kwargs) + try: + response = requests.post( + self.base_url, + json=self._create_payload( + self.create_message(messages), + generate=generate, + gen_kwargs=gen_kwargs, + seed=self._seed, + **kwargs, + ), + headers=self.header, + ) + if not response.ok: + eval_logger.warning( + f"API request failed with error message: {response.text}. Retrying..." + ) + response.raise_for_status() + return response.json() + except RetryError: + eval_logger.error( + "API request failed after multiple retries. Please check the API status." + ) + return None + + async def amodel_call( + self, + session: ClientSession, + messages: Union[List[List[int]], List[str], List[JsonChatStr]], + *, + generate: bool = True, + cache_keys: list = None, + ctxlens: Optional[List[int]] = None, + gen_kwargs: Optional[Dict] = None, + **kwargs, + ) -> Union[List[str], List[Tuple[float, bool]], None]: + # !!! Copy: shared dict for each request, need new object !!! 
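+        # (The gen_kwargs dict arrives shared between requests, and payload builders
+        # such as _create_payload pop keys like "until" or "max_gen_toks" from it, so
+        # without the deep copy the first request in a batch could strip those keys
+        # for every later one.)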
+ gen_kwargs = copy.deepcopy(gen_kwargs) + payload = self._create_payload( + self.create_message(messages), + generate=generate, + gen_kwargs=gen_kwargs, + seed=self._seed, + **kwargs, + ) + cache_method = "generate_until" if generate else "loglikelihood" + try: + async with session.post( + self.base_url, + json=payload, + headers=self.header, + ) as response: + if not response.ok: + error_text = await response.text() + eval_logger.warning( + f"API request failed with error message: {error_text}. Retrying..." + ) + # raising exception will retry the request + response.raise_for_status() + outputs = await response.json() + answers = ( + self.parse_generations( + outputs=outputs, + ) + if generate + else self.parse_logprobs( + outputs=outputs, + tokens=messages, + ctxlens=ctxlens, + ) + ) + if cache_keys: + for res, cache in zip(answers, cache_keys): + self.cache_hook.add_partial(cache_method, cache, res) + return answers + # If the retries also fail + except RetryError: + eval_logger.error( + "API request failed after multiple retries. Please check the API status." + ) + return None + + def batch_logliklehood_requests( + self, chunks: Iterable[List[LogLikelihoodInputs]] + ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]: + inputs = [] + ctxlens = [] + cache_keys = [] + for chunk in chunks: + for cache_key, context_enc, continuation_enc in chunk: + inp = (context_enc + continuation_enc)[-(self.max_length) :] + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length) + ) + + inputs.append(inp) + ctxlens.append(ctxlen) + cache_keys.append(cache_key) + return inputs, ctxlens, cache_keys + + async def get_batched_requests( + self, + requests: list, + cache_keys: list, + *, + generate: bool = True, + ctxlens: List[int] = None, + **kwargs, + ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]: + ctxlens = ctxlens if ctxlens else [None] * len(requests) + conn = TCPConnector(limit=self._concurrent) + async with ClientSession(connector=conn) as session: + retry_: Callable[..., Awaitable[Any]] = retry( + stop=stop_after_attempt(self.max_retries), + wait=wait_exponential(multiplier=0.5, min=1, max=10), + reraise=True, + )(self.amodel_call) + # Create tasks for each batch of request + tasks = [ + asyncio.create_task( + retry_( + session=session, + messages=message, + cache_keys=cache_key, + generate=generate, + ctxlens=ctxlen, + **kwargs, + ) + ) + for message, cache_key, ctxlen in zip( + chunks(requests, n=self._batch_size), + chunks(cache_keys, n=self._batch_size), + chunks(ctxlens, n=self._batch_size), + ) + ] + + return await tqdm_asyncio.gather(*tasks, desc="Requesting API") + + def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: + assert ( + self.tokenizer is not None + ), "Tokenizer is required for loglikelihood tasks to compute context lengths." + res = [] + + def _collate(req: LogLikelihoodInputs): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. 
this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + re_ord = Collator( + requests, + sort_fn=_collate, + group_by=None, + ) + # if concurrent then we'll batch in the async context + chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0) + if self._concurrent <= 1: + pbar = tqdm(desc="Requesting API", total=len(requests)) + for chunk in chunked: + inputs, ctxlens, cache_keys = self.batch_logliklehood_requests([chunk]) + + outputs = retry( + stop=stop_after_attempt(self.max_retries), + wait=wait_exponential(multiplier=0.5, min=1, max=10), + reraise=True, + )(self.model_call)(messages=inputs, generate=False) + if isinstance(outputs, dict): + outputs = [outputs] + for answer_, cache_key in zip( + self.parse_logprobs( + outputs=outputs, tokens=inputs, ctxlens=ctxlens + ), + cache_keys, + ): + if answer_ is not None: + res.append(answer_) + # partial caching + if cache_key is not None: + self.cache_hook.add_partial( + "loglikelihood", cache_key, answer_ + ) + pbar.update(1) + else: + inputs, ctxlens, cache_keys = self.batch_logliklehood_requests(chunked) + res = itertools.chain.from_iterable( + asyncio.run( + self.get_batched_requests( + inputs, cache_keys, generate=False, ctxlens=ctxlens + ) + ) + ) + + return re_ord.get_original(res) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + res = [] + + def _collate_gen(_requests): + # sort by the length of the non-tokenized contexts + return -len(_requests[0]) + + # Let the API deal with tokenization + requests, all_gen_kwargs = zip(*(req.args for req in requests)) + if self.tokenized_requests: + encodings_list = self.tok_encode( + requests, add_special_tokens=self.add_bos_token + ) + else: + encodings_list = [None] * len(requests) + requests = [ + (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list) + ] + + re_ord = Collator( + requests, + sort_fn=_collate_gen, + group_by="gen_kwargs", + ) + chunked = re_ord.get_batched( + n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None + ) + if self._concurrent <= 1: + pbar = tqdm(desc="Requesting API", total=len(requests)) + for chunk in chunked: + contexts, all_gen_kwargs, encodings_list = zip(*chunk) + req = encodings_list if self.tokenized_requests else contexts + outputs = retry( + stop=stop_after_attempt(self.max_retries), + wait=wait_exponential(multiplier=0.5, min=1, max=10), + reraise=True, + )(self.model_call)( + messages=req, + generate=True, + gen_kwargs=copy.deepcopy(all_gen_kwargs[0]), + ) + for generated_text, context in zip( + self.parse_generations( + outputs=outputs, + contexts=contexts, + ), + contexts, + ): + if generated_text is not None: + res.append(generated_text) + + # partial caching + if context is not None: + self.cache_hook.add_partial( + "generate_until", + (context, all_gen_kwargs[0]), + generated_text, + ) + pbar.update(1) + else: + for chunk in chunked: + contexts, all_gen_kwargs, encodings_list = zip(*chunk) + req = encodings_list if self.tokenized_requests else contexts + results = itertools.chain.from_iterable( + asyncio.run( + self.get_batched_requests( + req, + cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts], + generate=True, + gen_kwargs=copy.deepcopy(all_gen_kwargs[0]), + ) + ) + ) + res.extend(results) + + return re_ord.get_original(res) + + 
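+    # The rolling-loglikelihood path below scores an entire document with no
+    # conditioning prompt: the string is tokenized, split into windows of at most
+    # self.max_length tokens via utils.get_rolling_token_windows and
+    # utils.make_disjoint_window, each window is scored through _loglikelihood_tokens,
+    # and the per-window loglikelihoods are summed into one value per request. Because
+    # the windows are made disjoint, each token should be scored exactly once.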
def loglikelihood_rolling( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[float]: + loglikelihoods = [] + + for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.prefix_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + + # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + string_nll = self._loglikelihood_tokens( + rolling_token_windows, + disable_tqdm=True, + ) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + return loglikelihoods diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..afbcf52b1a5ac38f0bb1395cef82593a23524566 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/huggingface.py @@ -0,0 +1,1356 @@ +import copy +import os +from datetime import timedelta +from pathlib import Path +from typing import Dict, List, Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import transformers +from accelerate import ( + Accelerator, + InitProcessGroupKwargs, + find_executable_batch_size, +) +from accelerate.utils import get_max_memory +from huggingface_hub import HfApi +from packaging import version +from peft import PeftModel +from peft import __version__ as PEFT_VERSION +from tqdm import tqdm +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, +) + +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import TemplateLM +from lm_eval.api.registry import register_model +from lm_eval.models.utils import ( + Collator, + clear_torch_cache, + configure_pad_token, + get_dtype, + pad_and_concat, + stop_sequences_criteria, +) + + +eval_logger = utils.eval_logger + + +@register_model("hf-auto", "hf", "huggingface") +class HFLM(TemplateLM): + """ + An abstracted Huggingface model class. Enables usage with both models of + `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes. + + Supports data-parallel multi-GPU with HF Accelerate. 
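+
+    A minimal usage sketch ("gpt2" is only a placeholder checkpoint; any hub id or
+    local path accepted by the underlying AutoModel class should work):
+
+        lm = HFLM(pretrained="gpt2", device="cuda", batch_size=8, dtype="auto")
+        ids = lm.tok_encode("Hello world")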
+ """ + + AUTO_MODEL_CLASS = None + _DEFAULT_MAX_LENGTH = 2048 + + def __init__( + self, + pretrained: Union[str, transformers.PreTrainedModel], + backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", + # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq) + revision: Optional[str] = "main", + subfolder: Optional[str] = None, + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ] = None, + truncation: Optional[bool] = False, + logits_cache: bool = True, + max_length: Optional[int] = None, + device: Optional[str] = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + max_batch_size: Optional[int] = 64, + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + add_bos_token: Optional[bool] = False, + prefix_token_id: Optional[int] = None, + # arguments used for splitting a model across GPUs naively. + # only used if `parallelize=True`. + parallelize: Optional[bool] = False, + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[Union[str, os.PathLike]] = "./offload", + # PEFT, delta weights and quantization options + peft: Optional[str] = None, + delta: Optional[str] = None, + autogptq: Optional[Union[bool, str]] = False, + **kwargs, + ) -> None: + super().__init__() + + # optionally: take in an already-initialized transformers.PreTrainedModel + if not isinstance(pretrained, str): + eval_logger.warning( + "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way." + ) + assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" + self._model = pretrained + self._device = self._model.device + self._config = self._model.config + gpus = 0 + + else: + assert isinstance(device, str) + assert isinstance(pretrained, str) + assert isinstance(batch_size, (int, str)) + + gpus = torch.cuda.device_count() + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + if accelerator.num_processes > 1: + self.accelerator = accelerator + + if "npu" in accelerator.device.type: + gpus = torch.npu.device_count() + + # using one process with no model parallelism + if not (parallelize or accelerator.num_processes > 1): + # use user-passed device + device_list = set( + ["cuda", "cpu"] + + [f"cuda:{i}" for i in range(gpus)] + + ["mps", "mps:0"] + + [f"npu:{i}" for i in range(gpus)] + ) + if device and device in device_list: + self._device = torch.device(device) + eval_logger.info(f"Using device '{device}'") + if device in ("mps", "mps:0") and version.parse( + torch.__version__ + ) < version.parse("2.1"): + raise RuntimeError( + f"mps requires torch >= 2.1. You have {torch.__version__}" + ) + else: + eval_logger.info("Device not specified") + eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}") + self._device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + else: # Parallelism managed by accelerate + if device != "cuda": + eval_logger.info( + f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model." 
+ ) + # TODO: include in warning that `load_in_8bit` etc. affect this too + self._device = ( + self.accelerator.device + if hasattr(self, "accelerator") + else torch.device(device) + ) + + revision = str(revision) # cast to string if not already one + # TODO: update this to be less of a hack once subfolder is fixed in HF + revision = revision + ("/" + subfolder if subfolder is not None else "") + + self._get_config( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + ) + + # determine which of 'causal' and 'seq2seq' backends to use + self._get_backend( + config=self.config, backend=backend, trust_remote_code=trust_remote_code + ) + + # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT + self._create_tokenizer( + pretrained, + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast_tokenizer=use_fast_tokenizer, + ) + + # if we passed `pretrained` as a string, initialize our model now + if isinstance(pretrained, str): + self._create_model( + pretrained=pretrained, + revision=revision, + dtype=dtype, + trust_remote_code=trust_remote_code, + parallelize=parallelize, + gpus=gpus, + max_memory_per_gpu=max_memory_per_gpu, + max_cpu_memory=max_cpu_memory, + offload_folder=offload_folder, + peft=peft, + delta=delta, + autogptq=autogptq, + **kwargs, + ) + + # access self._model through self.model property outside this method + if isinstance(self.model, torch.nn.Module): + self.model.eval() + self.model.tie_weights() + + self.truncation = truncation + self.logits_cache = logits_cache + self.vocab_size = self.tokenizer.vocab_size + # select (or create) a pad token to use + self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) + + self.add_bos_token = add_bos_token + if "gemma" in getattr(self.config, "model_type", ""): + self.add_bos_token = True + eval_logger.info( + f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it." + ) + + self._max_length = max_length + self.pretrained = pretrained + self.delta = delta + self.peft = peft + self.revision = revision + self.batch_schedule = 1 + self.batch_sizes = {} + self.max_batch_size = max_batch_size + + if str(batch_size).startswith("auto"): + batch_size = batch_size.split(":") + self.batch_size_per_gpu = batch_size[0] + self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1 + else: + self.batch_size_per_gpu = int(batch_size) + + if isinstance(pretrained, str): + if gpus >= 1 or str(self.device) == "mps": + # TODO: can remove this whole snippet except in the mps case, perhaps? + if not (parallelize or autogptq or hasattr(self, "accelerator")): + # place model onto device requested manually, + # if not using HF Accelerate or device_map + # or any other option that preloads model onto device + try: + self.model.to(self.device) + except ValueError: + eval_logger.debug( + "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore." + ) + # multigpu data-parallel support when launched with accelerate + if gpus > 1: + if accelerator.num_processes > 1: + if parallelize: + eval_logger.warning( + "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available." 
+ ) + elif gpus > accelerator.num_processes: + eval_logger.warning( + "WARNING: The number of total system GPUs does not match the number of spawned processes. " + "If you would like to use data parallelism, please launch the script " + "with 'accelerate launch *script*'. " + f"Current run will proceed with {accelerator.num_processes} devices." + ) + if self.accelerator.is_local_main_process: + eval_logger.info( + f"Using {gpus} devices with data parallelism" + ) + + self._device = torch.device(f"{accelerator.device}") + self.accelerator = accelerator + + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + # if we aren't launching via accelerate, ditch + self._rank = 0 + self._world_size = 1 + else: + # if a PreTrainedModel was passed into HFLM, we forgo distributed setup. + eval_logger.warning( + "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration" + ) + self._rank = 0 + self._world_size = 1 + + self.custom_prefix_token_id = prefix_token_id + if prefix_token_id is not None: + eval_logger.info( + f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}" + ) + + def _get_accelerate_args( + self, + parallelize: bool = None, + device_map: Optional[str] = "auto", + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", + gpus: Optional[int] = None, + ) -> dict: + """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.""" + num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes + if ( + num_machines == 0 + and hasattr(self, "accelerator") + and self.accelerator is not None + ): + eval_logger.info( + "We are not in a distributed setting for accelerate. Setting model_parallel to False." 
+ ) + parallelize = False + + if parallelize is None: + # If parallelism is unset by the user, we automatically assign model parallelism + # if enough extra GPUs are available + max_memory_all_gpus = get_max_memory() + # We just want gpu, not cpu, max memory + if "cpu" in max_memory_all_gpus: + del max_memory_all_gpus["cpu"] + parallelize = bool(num_local_processes < len(max_memory_all_gpus)) + eval_logger.info( + f"Setting model parallel to {parallelize} since " + f"the number of local processes is {num_local_processes} " + f"and the number of GPUs is {len(max_memory_all_gpus)}" + ) + + args = {} + if parallelize: # Model parallelism will be used + max_memory = {} + if max_memory_per_gpu is not None: # Using the provided memory requirements + max_memory_per_gpu_map = { + device_idx: max_memory_per_gpu for device_idx in range(gpus) + } + else: # Estimating the possible memory requirements + max_memory_all_gpus = get_max_memory() + if "cpu" in max_memory_all_gpus: + del max_memory_all_gpus["cpu"] + if not hasattr(self, "accelerator"): + max_memory_per_gpu_map = { + k: v for k, v in max_memory_all_gpus.items() + } + else: + # use only 1 / num_processes of the GPUs if we are running under accelerate launch + max_memory_per_gpu_map = { + k: v + for k, v in max_memory_all_gpus.items() + if k % num_local_processes + == (self.accelerator.process_index % num_local_processes) + } + args["max_memory"] = max_memory_per_gpu_map + args["device_map"] = "auto" + eval_logger.info( + f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to 'auto'" + ) + + if max_cpu_memory is not None: + max_memory["cpu"] = max_cpu_memory + + args["offload_folder"] = offload_folder + elif ( + device_map is None + ): # No model parallelism, we use the default provided device for our model + if hasattr(self, "accelerator"): + device_map = {"": f"{self.accelerator.device}"} + else: + device_map = {"": str(self.device)} + args["max_memory"] = None + args["device_map"] = device_map + eval_logger.info( + f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}" + ) + else: + args["max_memory"] = None + args["device_map"] = None + eval_logger.info("Model parallel was set to False.") + + return args + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. 
+ return self._config + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def prefix_token_id(self): + # it is used as prefix for loglikelihood + if self.custom_prefix_token_id is not None: + return self.custom_prefix_token_id + if self.tokenizer.bos_token_id is not None: + return self.tokenizer.bos_token_id + return self.tokenizer.eos_token_id + + @property + def max_length(self): + if self._max_length: # if max length manually set, return it + return self._max_length + seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx") + for attr in seqlen_config_attrs: + if hasattr(self.model.config, attr): + return getattr(self.model.config, attr) + if hasattr(self.tokenizer, "model_max_length"): + if self.tokenizer.model_max_length == 1000000000000000019884624838656: + return self._DEFAULT_MAX_LENGTH + return self.tokenizer.model_max_length + return self._DEFAULT_MAX_LENGTH + + @property + def max_gen_toks(self) -> int: + return 256 + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + @property + def tokenizer_name(self) -> str: + return self.tokenizer.name_or_path.replace("/", "__") + + @property + def chat_template(self) -> str: + if self.tokenizer.chat_template is not None: + return self.tokenizer.chat_template + return self.tokenizer.default_chat_template + + def _get_backend( + self, + config: Union[transformers.PretrainedConfig, transformers.AutoConfig], + backend: Optional[Literal["default", "causal", "seq2seq"]] = "default", + trust_remote_code: Optional[bool] = False, + ) -> None: + """ + Helper method during initialization. + Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) + model type to be used. + """ + assert backend in ["default", "causal", "seq2seq"] + + if backend != "default": + # if we've settled on non-default backend, use that manually + if backend == "causal": + self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + elif backend == "seq2seq": + self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM + eval_logger.info( + f"Overrode HF model backend type, and using type '{backend}'" + ) + else: + # determine and use the default HF backend for this model, based on its config + metadata. + if ( + getattr(config, "model_type") + in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + ): + # first check if model type is listed under seq2seq models, since some + # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers. + # these special cases should be treated as seq2seq models. + self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM + elif ( + getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ): + self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + else: + if not trust_remote_code: + eval_logger.warning( + "HF model type is neither marked as CausalLM or Seq2SeqLM. \ + This is expected if your model requires `trust_remote_code=True` but may be an error otherwise." 
+ ) + # if model type is neither in HF transformers causal or seq2seq model registries + # then we default to AutoModelForCausalLM + self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + + assert self.AUTO_MODEL_CLASS in [ + transformers.AutoModelForCausalLM, + transformers.AutoModelForSeq2SeqLM, + ] + return None + + def _get_config( + self, + pretrained: str, + revision: str = "main", + trust_remote_code: bool = False, + ) -> None: + self._config = transformers.AutoConfig.from_pretrained( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + ) + + def _create_model( + self, + pretrained: str, + revision: Optional[str] = "main", + dtype: Optional[Union[str, torch.dtype]] = "auto", + trust_remote_code: Optional[bool] = False, + # arguments used for splitting a model across GPUs naively. + # only used if `parallelize=True`. + # (accelerate naive PP (device_map) options) + parallelize: Optional[bool] = False, + gpus: Optional[int] = None, + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", + # PEFT, delta weights and quantization options + peft: Optional[str] = None, + delta: Optional[str] = None, + autogptq: Optional[Union[bool, str]] = False, + **kwargs, + ) -> None: + """ + Initializes an HF or HF-compatible PreTrainedModel from scratch + inside HFLM, using the kwargs passed into self.__init__(). + + Also handles functionality such as AutoGPTQ usage and PEFT wrapping. + + For future similar extensions to AutoGPTQ that are not core to HF's ecosystem, + (such as PyTorch models that are nearly, but not quite, fully mirroring + HF's public interface relied on in this HFLM class) + please consider subclassing HFLM and overriding this and other methods as needed. + """ + + model_kwargs = kwargs if kwargs else {} + + model_kwargs.update( + self._get_accelerate_args( + parallelize=parallelize, + device_map=kwargs.get("device_map", None), + max_memory_per_gpu=max_memory_per_gpu, + max_cpu_memory=max_cpu_memory, + offload_folder=offload_folder, + gpus=gpus, + ) + ) + + if not autogptq: + if model_kwargs.get("load_in_4bit", None): + assert ( + transformers.__version__ >= "4.30.0" + ), "load_in_4bit requires transformers >= 4.30.0" + if transformers.__version__ >= "4.30.0": + if model_kwargs.get("load_in_4bit", None): + if model_kwargs.get("bnb_4bit_compute_dtype", None): + model_kwargs["bnb_4bit_compute_dtype"] = get_dtype( + model_kwargs["bnb_4bit_compute_dtype"] + ) + + self._model = self.AUTO_MODEL_CLASS.from_pretrained( + pretrained, + revision=revision, + torch_dtype=get_dtype(dtype), + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + else: + try: + from auto_gptq import AutoGPTQForCausalLM + except ModuleNotFoundError: + raise Exception( + "Tried to load auto_gptq, but auto-gptq is not installed ", + "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]", + ) + + self._model = AutoGPTQForCausalLM.from_quantized( + pretrained, + trust_remote_code=trust_remote_code, + model_basename=None if autogptq is True else Path(autogptq).stem, + use_safetensors=True + if autogptq is True + else autogptq.endswith(".safetensors"), + **model_kwargs, + ) + + if peft and delta: + raise ValueError( + "Cannot use both 'peft' and 'delta' options at the same time." 
+ ) + + if peft: + if model_kwargs.get("load_in_4bit", None): + if version.parse(PEFT_VERSION) < version.parse("0.4.0"): + raise AssertionError("load_in_4bit requires peft >= 0.4.0") + if self._model.config.vocab_size != len(self.tokenizer): + # resize model for LoRAs with added tokens + self._model.resize_token_embeddings(len(self.tokenizer)) + eval_logger.info( + f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..." + ) + self._model = PeftModel.from_pretrained( + self._model, peft, revision=revision + ) + elif delta: + if autogptq: + eval_logger.warning( + "Delta weights might trigger unexpected behavior when used with AutoGPTQ." + ) + _model_delta = self.AUTO_MODEL_CLASS.from_pretrained( + delta, + revision=revision, + torch_dtype=get_dtype(dtype), + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + for name, param in self._model.state_dict().items(): + try: + param.data += _model_delta.state_dict()[name] + except KeyError: + raise KeyError(f"Delta model is missing weights for layer: {name}") + except Exception as e: + raise RuntimeError( + f"Failed to add delta weights to layer {name}. Error: {e}" + ) + + del _model_delta + + return None + + def _create_tokenizer( + self, + pretrained: Union[str, transformers.PreTrainedModel], + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ], + revision: Optional[str] = "main", + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + ) -> None: + """ + Helper method during initialization. + + Create a tokenizer object corresponding to the correct + tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed. 
+ """ + + if tokenizer: + if isinstance(tokenizer, str): + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + else: + assert isinstance( + tokenizer, transformers.PreTrainedTokenizer + ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + self.tokenizer = tokenizer + else: + # Get tokenizer based on 'pretrained' + if isinstance(pretrained, str): + model_name = pretrained + else: + # get the HF hub name via accessor on model + model_name = self.model.name_or_path + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_name, + revision=revision, + trust_remote_code=trust_remote_code, + use_fast=use_fast_tokenizer, + ) + return None + + def _detect_batch_size(self, requests=None, pos: int = 0): + if requests: + _, context_enc, continuation_enc = requests[pos] + max_length = len( + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + ) + max_context_enc = len(context_enc[-(self.max_length + 1) :]) + max_cont_enc = len(continuation_enc[-(self.max_length + 1) :]) + else: + max_length = self.max_length + max_context_enc = max_length + max_cont_enc = max_length + + # if OOM, then halves batch_size and tries again + @find_executable_batch_size(starting_batch_size=self.max_batch_size) + def forward_batch(batch_size): + if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + length = max(max_context_enc, max_cont_enc) + batched_conts = torch.ones( + (batch_size, length), device=self.device + ).long() + test_batch = torch.ones((batch_size, length), device=self.device).long() + call_kwargs = { + "attn_mask": test_batch, + "labels": batched_conts, + } + else: + call_kwargs = {} + test_batch = torch.ones( + (batch_size, max_length), device=self.device + ).long() + for _ in range(5): + out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) # noqa: F841 + + return batch_size + + try: + batch_size = forward_batch() + except RuntimeError as e: + if "No executable batch size found" in str(e): + batch_size = 1 + else: + raise + + if self.world_size > 1: + # if multi-GPU, always take minimum over all selected batch sizes + max_rnk_bs = torch.tensor([batch_size], device=self.device) + gathered = ( + self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist() + ) + batch_size = min(gathered) + clear_torch_cache() + return batch_size + + clear_torch_cache() + return batch_size + + def tok_encode( + self, string: str, left_truncate_len=None, add_special_tokens=None + ) -> List[int]: + """ """ + # default for None - empty dict, use predefined tokenizer param + # used for all models except for CausalLM or predefined value + special_tokens_kwargs = {} + + # by default for CausalLM - false or self.add_bos_token is set + if add_special_tokens is None: + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + special_tokens_kwargs = { + "add_special_tokens": False or self.add_bos_token + } + # otherwise the method explicitly defines the value + else: + special_tokens_kwargs = {"add_special_tokens": add_special_tokens} + + encoding = self.tokenizer.encode(string, **special_tokens_kwargs) + + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + + return encoding + + def tok_batch_encode( + self, + strings: List[str], + padding_side: str = "left", + left_truncate_len: int = None, + truncation: bool = False, + ) -> 
Tuple[torch.Tensor, torch.Tensor]: + # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. + old_padding_side = self.tokenizer.padding_side + self.tokenizer.padding_side = padding_side + + add_special_tokens = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + add_special_tokens = {"add_special_tokens": False or self.add_bos_token} + + encoding = self.tokenizer( + strings, + truncation=truncation, + padding="longest", + return_tensors="pt", + **add_special_tokens, + ) + if left_truncate_len: + encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] + encoding["attention_mask"] = encoding["attention_mask"][ + :, -left_truncate_len: + ] + self.tokenizer.padding_side = old_padding_side + + return encoding["input_ids"], encoding["attention_mask"] + + def tok_decode(self, tokens, skip_special_tokens=True): + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) + + def _model_call(self, inps, attn_mask=None, labels=None): + """ + :param inps: torch.Tensor + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape + [batch, sequence_ctx]. the size of sequence may vary from call to call + :param attn_mask: torch.Tensor, optional + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed + (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM + :param labels: torch.Tensor, optional + A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed + (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM + :return + A torch tensor of shape [batch, sequence, vocab] with the + logits returned from the model's decoder + """ + with torch.no_grad(): + if attn_mask is not None or labels is not None: + assert attn_mask is not None and labels is not None + assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM + return self.model( + input_ids=inps, attention_mask=attn_mask, labels=labels + ).logits + else: + assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + return self.model(inps).logits + + def _model_generate(self, context, max_length, stop, **generation_kwargs): + # temperature = 0.0 if not set + # if do_sample is false and temp==0.0: + # remove temperature, as do_sample=False takes care of this + # and we don't want a warning from HF + generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0) + do_sample = generation_kwargs.get("do_sample", None) + + # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies + if generation_kwargs.get("temperature") == 0.0 and do_sample is None: + generation_kwargs["do_sample"] = do_sample = False + + if do_sample is False and generation_kwargs.get("temperature") == 0.0: + generation_kwargs.pop("temperature") + # build stopping criteria + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, context.shape[1], context.shape[0] + ) + return self.model.generate( + input_ids=context, + max_length=max_length, + stopping_criteria=stopping_criteria, + pad_token_id=self.tokenizer.pad_token_id, + use_cache=True, + **generation_kwargs, + ) + + def _select_cont_toks( + self, logits: torch.Tensor, contlen: int = None, inplen: int = None + ) -> torch.Tensor: + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + assert ( + contlen and inplen + ), "Must pass input len and cont. len to select scored logits for causal LM" + # discard right-padding. 
+ # also discard the input/context tokens. we'll only score continuations. + logits = logits[inplen - contlen : inplen] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + assert ( + contlen and not inplen + ), "Selecting scored logits for Seq2SeqLM requires only cont. len" + # only discard right-padding. + # the logits input to this fn only contain decoder-side tokens. + logits = logits[:contlen] + + return logits + + def loglikelihood_rolling( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[float]: + loglikelihoods = [] + + adaptive_batch_size = None + if self.batch_size == "auto": + # using rolling window with maximum context + print("Passed argument batch_size = auto. Detecting largest batch size") + batch_size = self._detect_batch_size() + print(f"Determined Largest batch size: {batch_size}") + adaptive_batch_size = batch_size + + for (string,) in tqdm( + [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0)) + ): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.prefix_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + + # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + pad_amnt = 0 + if self.world_size > 1: + # We pad out the external document-level iterator so the inner iterator doesn't hang + mytensor = torch.tensor(len(rolling_token_windows), device=self.device) + gathered = ( + self.accelerator.gather(mytensor).cpu().detach().numpy().tolist() + ) + + pad_amnt = max(gathered) - gathered[self.rank] + if pad_amnt > 0: + rolling_token_windows += pad_amnt * [rolling_token_windows[0]] + + string_nll = self._loglikelihood_tokens( + requests=rolling_token_windows, + disable_tqdm=True, + override_bs=adaptive_batch_size, + ) + + if (self.world_size > 1) and (pad_amnt > 0): + string_nll = [x[0] for x in string_nll[:-pad_amnt]] + else: + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + + return loglikelihoods + + def _batch_scheduler(self, pos, n_reordered_requests): + sched = pos // int(len(n_reordered_requests) / self.batch_schedule) + if sched in self.batch_sizes: + return self.batch_sizes[sched] + if (len(self.batch_sizes) > 1) and ( + self.batch_sizes[sched - 1] == self.max_batch_size + ): + # if previous batch size is already maximal, skip recomputation + self.batch_sizes[sched] = self.max_batch_size + return self.batch_sizes[sched] + print( + f"Passed argument batch_size = auto:{self.batch_schedule}. 
Detecting largest batch size" + ) + self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos) + print(f"Determined largest batch size: {self.batch_sizes[sched]}") + return self.batch_sizes[sched] + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + override_bs: int = None, + ) -> List[Tuple[float, bool]]: + # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context + res = [] + + def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = req[1] + req[2] + return -len(toks), tuple(toks) + + def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): + """Defines the key to group and lookup one-token continuations""" + # Use with group_by="contexts" (optional)" + # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations. + # speeds up some multiple-choice tasks proportionally to the number of choices. + # groups requests by context+continuation[:-1] and infer on one request/group. + return req[-2] + req[-1][:-1] + + re_ord = Collator( + requests, + sort_fn=_collate, + group_by="contexts" + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + and self.logits_cache + else None, + group_fn=_lookup_one_token_cont, + ) + + # automatic (variable) batch size detection for vectorization + # pull longest context sample from request + n_reordered_requests = len(re_ord) + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else override_bs + if override_bs is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" + and n_reordered_requests > 0 + and not override_bs + else None + ) + + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + cont_toks_list = [] + inplens = [] + + conts = [] + encoder_attns = [] + + padding_len_inp = None + padding_len_cont = None + # because vectorizing is annoying, we first convert each (context, continuation) pair to padded + # tensors, then we pack them together into a batch, call the model, and then pick it all apart + # again because vectorizing is annoying + + for _, context_enc, continuation_enc in chunk: + # sanity check + assert len(context_enc) > 0 + assert len(continuation_enc) > 0 + assert len(continuation_enc) <= self.max_length + + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + # when too long to fit in context, truncate from the left + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + 
inp = torch.tensor( + (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1], + dtype=torch.long, + device=self.device, + ) + (inplen,) = inp.shape + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + inp = torch.tensor( + (context_enc)[-self.max_length :], + dtype=torch.long, + device=self.device, + ) + (inplen,) = inp.shape + + # build encoder attn masks + encoder_attns.append(torch.ones_like(inp)) + + cont = torch.tensor( + (continuation_enc)[-self.max_length :], + # TODO: left-shift these? + # TODO: our code assumes we never end up truncating conts for either model type + dtype=torch.long, + device=self.device, + ) + (contlen,) = cont.shape + + conts.append(cont) + + padding_len_cont = ( + max(padding_len_cont, contlen) + if padding_len_cont is not None + else contlen + ) + + padding_len_inp = ( + max(padding_len_inp, inplen) + if padding_len_inp is not None + else inplen + ) + + inps.append(inp) # [1, inp_length] + cont_toks_list.append(continuation_enc) + inplens.append(inplen) + + # create encoder attn mask and batched conts, if seq2seq + call_kwargs = {} + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + batched_inps = pad_and_concat( + padding_len_inp, inps, padding_side="right" + ) # [batch, padding_len_inp] + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # TODO: left-pad encoder inps and mask? + batched_inps = pad_and_concat( + padding_len_inp, inps + ) # [batch, padding_len_inp] + batched_conts = pad_and_concat( + padding_len_cont, conts + ) # [batch, padding_len_cont] + batched_encoder_mask = pad_and_concat( + padding_len_inp, encoder_attns + ) # [batch, padding_len_inp] + call_kwargs = { + "attn_mask": batched_encoder_mask, + "labels": batched_conts, + } + + multi_logits = F.log_softmax( + self._model_call(batched_inps, **call_kwargs), dim=-1 + ) # [batch, padding_length (inp or cont), vocab] + + for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip( + chunk, multi_logits, inplens, cont_toks_list + ): + # Slice to original seq length + contlen = len(cont_toks) + # take only logits in the continuation + # (discard context toks if decoder-only ; discard right-padding) + # also discards + checks for "virtual tokens" in the causal LM's input window + # from prompt/prefix tuning tokens, if applicable + ctx_len = ( + inplen + (logits.shape[0] - padding_len_inp) + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + else None + ) + logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len) + logits = logits.unsqueeze(0) # [1, seq, vocab] + + # Check if per-token argmax is exactly equal to continuation + greedy_tokens = logits.argmax(dim=-1) + + # check for one-token continuation cache hits. + # noop in case group_by != "contexts" or no cache hit and returns the + # original args. Otherwise, expands the logits batch dimension and yields each + # batch along with matching continuation tokens and prompt strings. 
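+                # (illustrative) for a 4-way multiple-choice task whose answer options are
+                # single tokens, all four requests share the same context, so one forward
+                # pass / one cached logits row is enough to score every option.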
+ # logits -> [1, seq, vocab] + for request_str, cont_toks, logits in re_ord.get_cache( + req_str=request_str, + cxt_toks=ctx_tokens, + cont_toks=cont_toks, + logits=logits, + ): + cont_toks = torch.tensor( + cont_toks, dtype=torch.long, device=self.device + ).unsqueeze(0) # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + + # Obtain log-probs at the corresponding continuation token indices + # last_token_slice = logits[:, -1, :].squeeze(0).tolist() + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) # [1, seq] + + # Answer: (log prob, is-exact-match) + answer = (float(logits.sum()), bool(max_equal)) + + res.append(answer) + + self.cache_hook.add_partial("loglikelihood", request_str, answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def generate_until( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[str]: + res = [] + + def _collate(req: Tuple[str, dict]): + """Defines the key for the sorted method""" + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(req[0]) + return -len(toks), req[0] + + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running generate_until requests", + ) + adaptive_batch_size = None + if self.batch_size == "auto": + # using rolling window with maximum context + print("Passed argument batch_size = auto. Detecting largest batch size") + batch_size = self._detect_batch_size() + print(f"Determined Largest batch size: {batch_size}") + adaptive_batch_size = batch_size + # for each different set of kwargs, we execute all requests, by batch. + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else adaptive_batch_size + if adaptive_batch_size is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" and not adaptive_batch_size + else None + ) + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + # group_fn=lambda x: x[1] -> x=(context, gen_kwargs) + re_ords = Collator( + [reg.args for reg in requests], + sort_fn=_collate, + group_by="gen_kwargs", + group_fn=lambda x: x[1], + ) + chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. 
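+            # (illustrative) a typical gen_kwargs dict for one task might look like
+            #   {"until": ["\n\n"], "max_gen_toks": 128, "do_sample": False};
+            # "until" and "max_gen_toks" are consumed here, anything left over is
+            # forwarded to self._model_generate() below.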
+ until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" + ) + # add EOS token to stop sequences + eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False) + if not until: + until = [eos] + else: + until.append(eos) + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + # set the max length in tokens of inputs ("context_enc") + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + # max len for inputs = max length, minus room to generate the max new tokens + max_ctx_len = self.max_length - max_gen_toks + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # max len for inputs = encoder's whole max_length + max_ctx_len = self.max_length + + # encode, pad, and truncate contexts for this batch + context_enc, attn_masks = self.tok_batch_encode( + contexts, + left_truncate_len=max_ctx_len, + truncation=self.truncation, + ) + context_enc = context_enc.to(self.device) + attn_masks = attn_masks.to(self.device) + + if "max_length" not in kwargs: + kwargs["max_length"] = context_enc.shape[1] + max_gen_toks + + # perform batched generation + cont = self._model_generate( + context=context_enc, + attention_mask=attn_masks, + stop=until, + **kwargs, + ) + + cont_toks_list = cont.tolist() + for cont_toks, context in zip(cont_toks_list, contexts): + # discard context + left-padding toks if using causal decoder-only LM + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + cont_toks = cont_toks[context_enc.shape[1] :] + + s = self.tok_decode(cont_toks) + + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore '' separator, + # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + s = s.split(term)[0] + + res.append(s) + + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + + return res + + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + """ + Method to apply a chat template to a list of chat history between user and model. + """ + return self.tokenizer.apply_chat_template( + chat_history, tokenize=False, add_generation_prompt=True + ) + + def get_model_info(self) -> dict: + """ + Method to get Hugging Face model information for experiment reproducibility. + """ + + def get_model_num_params(model) -> int: + if hasattr(model, "num_parameters"): + return model.num_parameters() + if hasattr(model, "parameters"): + return sum(p.numel() for p in model.parameters()) + else: + return -1 + + def get_model_dtype(model) -> str: + if hasattr(model, "dtype"): + return model.dtype + else: + return "" + + def get_model_sha(pretrained: str, revision: str) -> str: + try: + model_info = HfApi().model_info(repo_id=pretrained, revision=revision) + return model_info.sha + except Exception as e: + eval_logger.warn( + f"Failed to get model SHA for {pretrained} at revision {revision}. 
Error: {e}" + ) + return "" + + model_info = { + "model_num_parameters": get_model_num_params(self._model), + "model_dtype": get_model_dtype(self._model), + "model_revision": self.revision, + "model_sha": get_model_sha(self.pretrained, self.revision), + } + if self.peft: + model_info["peft_sha"] = get_model_sha(self.peft, self.revision) + if self.delta: + model_info["delta_sha"] = get_model_sha(self.delta, self.revision) + return model_info diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..cb1aca1eec18a05725ffb29e15f633078cab699b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/nemo_lm.py @@ -0,0 +1,537 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import pathlib +from copy import deepcopy +from typing import List, Literal + +import filelock +import numpy as np +import torch +from tqdm import tqdm + +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.utils import Collator +from lm_eval.utils import ( + eval_logger, + get_rolling_token_windows, + make_disjoint_window, + simple_parse_args_string, +) + + +def _patch_pretrained_cfg( + pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size +): + try: + import omegaconf + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + + omegaconf.OmegaConf.set_struct(pretrained_cfg, True) + with omegaconf.open_dict(pretrained_cfg): + attributes_to_update = { + "sequence_parallel": False, + "activations_checkpoint_granularity": None, + "activations_checkpoint_method": None, + "precision": trainer.precision, + "global_batch_size": None, + "tensor_model_parallel_size": tensor_model_parallel_size, + "pipeline_model_parallel_size": pipeline_model_parallel_size, + "apply_rope_fusion": False, + } + for name, value in attributes_to_update.items(): + if hasattr(pretrained_cfg, name): + pretrained_cfg[name] = value + return pretrained_cfg + + +def _get_target_from_class(target_class) -> str: + return f"{target_class.__module__}.{target_class.__name__}" + + +def load_model( + model_path: str, + trainer, + tensor_model_parallel_size: int, + pipeline_model_parallel_size: int, +) -> torch.nn.Module: + try: + from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + MegatronGPTModel, + ) + from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo 
following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + model_path = pathlib.Path(model_path) + + save_restore_connector = NLPSaveRestoreConnector() + if model_path.is_dir(): + save_restore_connector.model_extracted_dir = model_path.as_posix() + pretrained_cfg = save_restore_connector.restore_from( + None, model_path.as_posix(), return_config=True, trainer=trainer + ) + if not hasattr(pretrained_cfg, "target"): + pretrained_cfg["target"] = _get_target_from_class(MegatronGPTModel) + + pretrained_cfg = _patch_pretrained_cfg( + pretrained_cfg, + trainer, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + ) + + model_to_load_path = model_path + override_config = pretrained_cfg + + module_name, class_name = override_config.target.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name), class_name) + + # monkeypatch _build_tokenizer method to be process-safe + tokenizer_lock = filelock.FileLock(f"/tmp/{model_path.name}.tokenizer.lock") + + def _synced_build_tokenizer(self): + with tokenizer_lock: + self._original_build_tokenizer() + + model_class._original_build_tokenizer = model_class._build_tokenizer + model_class._build_tokenizer = _synced_build_tokenizer + + model = model_class.restore_from( + restore_path=model_to_load_path.as_posix(), + trainer=trainer, + override_config_path=override_config, + save_restore_connector=save_restore_connector, + map_location=f"cuda:{trainer.local_rank}", + ) + + model.freeze() + model.training = False + try: + # Have to turn off activations_checkpoint_method for inference + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + return model + + +def setup_distributed_environment(trainer): + try: + from nemo.utils.app_state import AppState + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + + def dummy(): + return + + if trainer.strategy.launcher is not None: + trainer.strategy.launcher.launch(dummy, trainer=trainer) + trainer.strategy.setup_environment() + + app_state = AppState() + + return app_state + + +@register_model("nemo_lm") +class NeMoLM(LM): + def __init__( + self, + path: str, + max_length: int = 4096, + batch_size: int = 1, + max_gen_toks: int = 256, + devices: int = 1, + num_nodes: int = 1, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + precision: Literal[ + "16-mixed", + "bf16-mixed", + "32-true", + "64-true", + 64, + 32, + 16, + "64", + "32", + "16", + "bf16", + ] = "bf16", + **kwargs, + ): + try: + from nemo.collections.nlp.modules.common.text_generation_utils import ( + generate, + ) + from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + from pytorch_lightning.trainer.trainer import Trainer + + self.generate = generate + except ModuleNotFoundError: + raise Exception( + "Attempted to use 'nemo_lm' model type, but package `nemo` is not installed" + "Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, " + "or installing nemo following https://github.com/NVIDIA/NeMo.", + ) + + super().__init__() + + if ( + tensor_model_parallel_size == 1 
+ and pipeline_model_parallel_size == 1 + and devices > 1 + ): + eval_logger.info( + f"The number of data replicas for evaluation is {devices}." + ) + eval_logger.info(f"The total number of devices is {devices}.") + eval_logger.info( + "No tensor parallelism or pipeline parallelism is applied." + ) + + elif tensor_model_parallel_size * pipeline_model_parallel_size == devices: + eval_logger.info( + f"Setting tensor parallelism to {tensor_model_parallel_size} and pipeline parallelism to {pipeline_model_parallel_size}." + ) + eval_logger.info(f"The total number of devices is {devices}.") + eval_logger.info("No data parallelism is applied.") + + else: + raise ValueError( + "Please set the product of tensor_model_parallel_size and pipeline_model_parallel_size" + "equal to the specified number of devices." + ) + + if num_nodes > 1: + raise ValueError( + "A number of nodes greater than 1 is not supported yet. Please set num_nodes as 1." + ) + + trainer = Trainer( + strategy=NLPDDPStrategy(), + devices=devices, + accelerator="gpu", + num_nodes=num_nodes, + precision=precision, + logger=False, + enable_checkpointing=False, + use_distributed_sampler=False, + ) + # Modify the following flags only for data replication + if ( + tensor_model_parallel_size == 1 + and pipeline_model_parallel_size == 1 + and devices > 1 + ): + self._device = torch.device(f"cuda:{trainer.global_rank}") + self._rank = trainer.global_rank + self._world_size = trainer.world_size + self.model = load_model( + path, + trainer, + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + ).cuda() + self.tokenizer = self.model.tokenizer + self.app_state = setup_distributed_environment(trainer) + + self._max_length = max_length + self._batch_size = int(batch_size) + self._max_gen_toks = max_gen_toks + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config=None): + args = simple_parse_args_string(arg_string) + if additional_config: + args["batch_size"] = additional_config.get("batch_size", 1) + + return cls(**args) + + @property + def eot_token_id(self): + try: + return self.tokenizer.eos_id + except AttributeError: + return None + + @property + def max_length(self): + return self._max_length + + @property + def max_gen_toks(self): + return self._max_gen_toks + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + @property + def accelerator(self): + return self._Accelerator(self.world_size) + + class _Accelerator: + def __init__(self, world_size): + self.world_size = world_size + + def wait_for_everyone(self): + torch.distributed.barrier() + + def gather(self, local_tensor): + gathered_tensors = [ + torch.zeros(1, dtype=local_tensor.dtype).cuda() + for _ in range(self.world_size) + ] + torch.distributed.all_gather(gathered_tensors, local_tensor) + return torch.cat(gathered_tensors) + + def tok_encode(self, string: str): + return self.tokenizer.text_to_ids(string) + + def tok_decode(self, tokens): + return self.tokenizer.ids_to_text(tokens) + + def _encode_pair(self, context, continuation): + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + 
continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc + + def loglikelihood(self, requests): + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + # end of text as context + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), + ) + else: + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def loglikelihood_rolling( + self, requests: List[Instance], disable_tqdm: bool = False + ) -> List[float]: + loglikelihoods = [] + + for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm): + rolling_token_windows = list( + map( + make_disjoint_window, + get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.eot_token_id, + max_seq_len=self.max_length - 1, + context_len=1, + ), + ) + ) + + rolling_token_windows = [(None,) + x for x in rolling_token_windows] + + string_nll = self._loglikelihood_tokens( + rolling_token_windows, + ) + + # discard is_greedy + string_nll = [x[0] for x in string_nll] + + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + return loglikelihoods + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + res = [] + + def _collate(x): + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = Collator(requests, sort_fn=_collate) + chunks = re_ord.get_batched(n=self.batch_size, batch_fn=None) + pbar = tqdm( + total=len(requests), + disable=(disable_tqdm or (self.rank != 0)), + desc="Running loglikelihood requests", + ) + for chunk in chunks: + inps = [] + ctxlens = [] + contlens = [] + + for _, context_enc, continuation_enc in chunk: + # Leave one token for generation. Tokens_to_generate = 0 breaks NeMo. + inp = (context_enc + continuation_enc)[-(self.max_length - 1) :] + + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length - 1) + ) + ctxlens.append(ctxlen) + contlens.append(len(continuation_enc)) + + inps.append(self.tok_decode(inp)) + + output = self.generate( + self.model, + inputs=inps, + tokens_to_generate=1, + min_tokens_to_generate=1, + compute_logprob=True, + all_probs=True, + ) + + batch_token_ids = np.asarray(output["token_ids"])[:, :-1] + batch_logprobs = output["logprob"][:, :-1] + batch_full_logprob = output["full_logprob"][:, :-1, :] + + # Compute greedy tokens for entire batch rather than calling it with proper ctxlen for each sample. + # Additional tokens for each sample will be trimmed later. + min_ctxlen = min(ctxlens) + + # Use min_ctxlen-1 instead of min_ctxlen since full_logprobs are not returns for the first token. + batch_greedy_tokens = ( + torch.argmax(batch_full_logprob[:, min_ctxlen - 1 :, :], -1) + .cpu() + .numpy() + ) + + for token_ids, greedy_tokens, logprobs, ctxlen, contlen, ( + cache_key, + _, + _, + ) in zip( + batch_token_ids, + batch_greedy_tokens, + batch_logprobs, + ctxlens, + contlens, + chunk, + ): + # Trim at contlen since shorter contexts in a batch will have more than one token generated. 
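+                # (illustration) with ctxlen=3 and contlen=2 the slice below is
+                # logprobs[2:][:2], i.e. exactly the two continuation positions.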
+ # Use ctxlen-1 instead of ctxlen same as for full_logprob in batch_greedy_tokens calculation + logprobs = (logprobs[ctxlen - 1 :])[:contlen] + logprob = sum(logprobs).tolist() + + continuation_tokens = (token_ids[ctxlen:])[:contlen] + len_diff = ctxlen - min_ctxlen + is_greedy = continuation_tokens == (greedy_tokens[len_diff:])[:contlen] + if not isinstance(is_greedy, bool): + is_greedy = is_greedy.all() + answer = (logprob, is_greedy) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + res.append(answer) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def generate_until(self, requests): + if not requests: + return [] + res = [] + + def get_until(req_args): + until = req_args.get("until", []) + until = deepcopy(until) # prevent from modifying req_args for cache_key + if self.tokenizer.ids_to_tokens([self.eot_token_id])[0] not in until: + until.append(self.tokenizer.ids_to_tokens([self.eot_token_id])[0]) + return until + + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ords = Collator( + [reg.args for reg in requests], sort_fn=_collate, group_by="gen_kwargs" + ) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + req_args = all_gen_kwargs[0] + # unpack our keyword arguments. + until = get_until(req_args) + max_gen_toks = req_args.get("max_gen_toks", self.max_gen_toks) + + remaining_length = self.max_length - max_gen_toks + contexts = [] + for context, _ in chunk: + encoded_context = self.tok_encode(context) + encoded_context = encoded_context[-remaining_length:] + contexts.append(self.tok_decode(encoded_context)) + + output = self.generate( + self.model, + inputs=contexts, + tokens_to_generate=max_gen_toks, + end_strings=until, + greedy=True, + ) + + answers = output["sentences"] + + continuations = [] + for context, answer in zip(contexts, answers): + continuations.append(answer[len(context) :]) + + for term in until: + continuations = [answer.split(term)[0] for answer in continuations] + + for request, answer in zip(chunk, continuations): + self.cache_hook.add_partial("greedy_until", request, answer) + res.append(answer) + + return re_ords.get_original(res) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/neuralmagic.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/neuralmagic.py new file mode 100644 index 0000000000000000000000000000000000000000..7c16b06d50b2b8117cf0b6d6b33d9d4a2b681923 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/neuralmagic.py @@ -0,0 +1,426 @@ +import copy +from typing import List, Optional, Tuple, Union + +import numpy +import transformers +from tqdm import tqdm + +import lm_eval.models.utils +from lm_eval import utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM + + +eval_logger = utils.eval_logger + + +@register_model("sparseml") +class SparseMLLM(HFLM): + """ + SparseML is an open-source model optimization toolkit that enables you to create + inference-optimized sparse models using pruning, quantization, and distillation + algorithms. Models optimized with SparseML can then be exported to the ONNX format and + deployed with DeepSparse for GPU-class performance on CPU hardware. 
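+
+    A minimal invocation might look like the following (the model identifier is a
+    placeholder; any SparseML-compatible checkpoint should work):
+        lm_eval --model sparseml --model_args pretrained=<sparse-model-or-path> --tasks lambada_openai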
+ + This class is a wrapper around the HuggingFace LM class to enable SparseML + integration with the lm-evaluation-harness. + """ + + def _create_model( + self, + pretrained: str, + revision: Optional[str] = "main", + dtype: Optional[str] = "auto", + trust_remote_code: Optional[bool] = False, + **kwargs, + ) -> None: + try: + from sparseml.transformers import SparseAutoModelForCausalLM + except ModuleNotFoundError: + raise Exception( + "Package `sparseml` is not installed. " + "Please install it via `pip install sparseml[transformers]`" + ) + + model_kwargs = kwargs if kwargs else {} + + if "device_map" not in model_kwargs: + # set a device_map to initialize model on the right GPU. + # this is needed because it seems that the default behavior + # for quantized models now seems to be device_map="auto" + # which breaks data-parallel mode. + if hasattr(self, "accelerator"): + model_kwargs.update( + {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}} + ) + else: + model_kwargs.update({"device_map": {"": str(self.device)}}) + + relevant_kwarg_names = [ + "offload_folder", + "device_map", + ] + relevant_kwargs = { + k: v for k, v in model_kwargs.items() if k in relevant_kwarg_names + } + + # Log the difference between model_kwargs and relevant_kwargs so we can see + # what is being ignored + ignored_kwargs = {} + for k, v in model_kwargs.items(): + if k not in relevant_kwargs.keys(): + ignored_kwargs[k] = v + eval_logger.warning( + f"The sparseml integration is ignoring the following kwargs that are specified: {ignored_kwargs}" + ) + + model = SparseAutoModelForCausalLM.from_pretrained( + pretrained, + revision=revision, + torch_dtype=lm_eval.models.utils.get_dtype(dtype), + trust_remote_code=trust_remote_code, + **relevant_kwargs, + ) + self._model = model + + def _get_config(self, pretrained: str, **kwargs) -> None: + try: + from sparseml.transformers import SparseAutoConfig + except ModuleNotFoundError: + raise Exception( + "Package `sparseml` is not installed. " + "Please install it via `pip install sparseml[transformers]`" + ) + + self._config = SparseAutoConfig.from_pretrained( + pretrained_model_name_or_path=pretrained, **kwargs + ) + + def _create_tokenizer( + self, + pretrained: Union[str, transformers.PreTrainedModel], + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ], + **kwargs, + ) -> None: + try: + from sparseml.transformers import SparseAutoTokenizer + except ModuleNotFoundError: + raise Exception( + "Package `sparseml` is not installed. " + "Please install it via `pip install sparseml[transformers]`" + ) + + if tokenizer: + if isinstance(tokenizer, str): + self.tokenizer = SparseAutoTokenizer.from_pretrained( + tokenizer, + **kwargs, + ) + else: + assert isinstance( + tokenizer, transformers.PreTrainedTokenizer + ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + self.tokenizer = tokenizer + else: + # Get tokenizer based on 'pretrained' + if isinstance(pretrained, str): + model_name = pretrained + else: + # get the HF hub name via accessor on model + model_name = self.model.name_or_path + self.tokenizer = SparseAutoTokenizer.from_pretrained( + model_name, + **kwargs, + ) + return None + + +@register_model("deepsparse") +class DeepSparseLM(LM): + """ + Wrapper around DeepSparse, a sparsity-aware deep learning + inference runtime for CPUs, to make it compatible with the + lm-evaluation-harness. 
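+
+    A minimal invocation might look like the following (the model path is a placeholder;
+    a SparseZoo stub or a local DeepSparse deployment directory are both expected to work):
+        lm_eval --model deepsparse --model_args pretrained=<sparsezoo-stub-or-deployment-dir> --tasks lambada_openai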
+ """ + + _DEFAULT_MAX_LENGTH = 2048 + + def __init__( + self, + pretrained: str, + tokenizer: Optional[ + Union[ + str, + transformers.PreTrainedTokenizer, + transformers.PreTrainedTokenizerFast, + ] + ] = None, + batch_size: Optional[Union[int, str]] = 1, + max_gen_toks: Optional[int] = 256, + max_length: Optional[int] = None, + ): + super().__init__() + + try: + import deepsparse + except ModuleNotFoundError: + raise Exception( + "Package `deepsparse` is not installed. " + "Please install it via `pip install deepsparse[transformers]`" + ) + + if isinstance(batch_size, str) and not batch_size.isdigit(): + eval_logger.warning( + f"batch_size={batch_size} is not valid for deepsparse because it is not an integer. " + "Ignoring and using the default of 1." + ) + batch_size = 1 + + self.batch_size = int(batch_size) + self._max_length = max_length if max_length else self._DEFAULT_MAX_LENGTH + self._max_gen_toks = max_gen_toks + self.batch_sizes = {} + + # Initialize new model and tokenizer instances + self.model = deepsparse.TextGeneration( + model_path=pretrained, + sequence_length=self._max_length, + batch_size=batch_size, + ) + self.tokenizer = tokenizer if tokenizer else self.model.tokenizer + self.config = self.model.config + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string) + + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def prefix_token_id(self): + # it is used as prefix for loglikelihood + if self.tokenizer.bos_token_id is not None: + return self.tokenizer.bos_token_id + return self.tokenizer.eos_token_id + + @property + def max_length(self) -> int: + return self._max_length + + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + raise NotImplementedError( + "Implementing empty context is not supported yet" + ) + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + ) -> List[Tuple[float, bool]]: + """ + The function to compute the loglikelihood of the continuation + tokens given the context tokens. 
+ + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + res = [] + + def _collate(x): + """Defines the key for the sorted method""" + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = utils.Reorderer(requests, _collate) + + for chunk in tqdm( + list(lm_eval.models.utils.chunks(re_ord.get_reordered(), self.batch_size)), + disable=disable_tqdm, + ): + batch_inp = [] + batch_cache_key = [] + batch_continuation_enc = [] + # len(chunk) is the batch_size + for cache_key, context_enc, continuation_enc in chunk: + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501 + + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + + batch_inp.append(self.tokenizer.decode(inp)) + batch_cache_key.append(cache_key) + batch_continuation_enc.append(continuation_enc) + + response = self.model( + prompt=batch_inp, + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, + ) + + for resp, continuation_enc, cache_key in zip( + response.generations, batch_continuation_enc, batch_cache_key + ): + # (seq_len, vocab_size) + multi_scores = resp.score + + from deepsparse.utils.data import numpy_log_softmax + + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + raise NotImplementedError( + "The method not required by any of our current task integrations so far" + ) + + def generate_until(self, requests: List[Instance]) -> List[str]: + """ + The function to generate a certain number of new tokens + given a context. 
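+        Requests with identical generation arguments are batched together, generated by
+        the DeepSparse pipeline with the requested stop sequences, and the outputs are
+        then cut off post-hoc at the first stop sequence.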
+ + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py + """ + if not requests: + return [] + res = [] + requests = [req.args for req in requests] + + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) + + if ret: + yield ret, lastuntil + + pbar = tqdm(total=len(requests)) + for chunk, request_args in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) + ): + inps = [] + + # make a deepcopy since we are changing arguments + request_args = copy.deepcopy(request_args) + + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) + + for context, _ in chunk: + # add context (prompts) to the list + inps.append(context) + + until = request_args.pop("until", ["<|endoftext|>"]) + request_args.pop("do_sample", None) + request_args["temperature"] = request_args.get("temperature", 0) + + # run inference (generate max_gen_toks tokens) + out = self.model( + sequences=inps, + max_new_tokens=self.max_gen_toks - 1, + stop=until, + **request_args, + ) + + for resp, (context, args_) in zip(out.generations, chunk): + text = resp.text + until_ = until + # split the text at the first occurrence of any of the until tokens + for term in until_: + if len(term) > 0: + text = text.split(term)[0] + + res.append(text) + + self.cache_hook.add_partial( + "generate_until", (context, {"until": until_}), text + ) + pbar.update(1) + + pbar.close() + + return re_ord.get_original(res) + + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/openai_completions.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/openai_completions.py new file mode 100644 index 0000000000000000000000000000000000000000..26dc93d68f469d69e9d165b6a3a0ba87a3055780 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/openai_completions.py @@ -0,0 +1,222 @@ +import os +from functools import cached_property +from typing import Any, Dict, List, Optional, Tuple, Union + +from lm_eval.api.registry import register_model +from lm_eval.models.api_models import TemplateAPI +from lm_eval.utils import eval_logger + + +@register_model("local-completions") +class LocalCompletionsAPI(TemplateAPI): + def __init__( + self, + base_url=None, + tokenizer_backend="huggingface", + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + + def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + generate=False, + gen_kwargs: Optional[dict] = None, + seed: int = 1234, + **kwargs, + ) -> dict: + if generate: + gen_kwargs.pop("do_sample", False) + max_tokens = 
gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = gen_kwargs.pop("until", ["<|endoftext|>"]) + return { + "prompt": messages, + "model": self.model, + "max_tokens": max_tokens, + "temperature": temperature, + "stop": stop, + "seed": seed, + **gen_kwargs, + } + else: + return { + "model": self.model, + "prompt": messages, + "temperature": 0, + "max_tokens": 1, + "logprobs": 1, + "seed": seed, + "echo": True, + } + + @staticmethod + def parse_logprobs( + outputs: Union[Dict, List[Dict]], + tokens: List[List[int]] = None, + ctxlens: List[int] = None, + **kwargs, + ) -> List[Tuple[float, bool]]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choice, ctxlen in zip(out["choices"], ctxlens): + assert ctxlen > 0, "Context length must be greater than 0" + logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1]) + tokens = choice["logprobs"]["token_logprobs"][ctxlen:-1] + top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1] + is_greedy = True + for tok, top in zip(tokens, top_logprobs): + if tok != max(top, key=top.get): + is_greedy = False + break + res.append((logprobs, is_greedy)) + return res + + @staticmethod + def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choices in out["choices"]: + res.append(choices["text"]) + return res + + @property + def api_key(self): + return os.environ.get("OPENAI_API_KEY", "") + + +@register_model("local-chat-completions") +class LocalChatCompletion(LocalCompletionsAPI): + def __init__( + self, + base_url=None, + tokenizer_backend=None, + tokenized_requests=False, + **kwargs, + ): + eval_logger.warning( + "chat-completions endpoint requires the `--apply_chat_template` flag." + ) + super().__init__( + base_url=base_url, + tokenizer_backend=tokenizer_backend, + tokenized_requests=tokenized_requests, + **kwargs, + ) + if self._batch_size > 1: + eval_logger.warning( + "Chat completions does not support batching. Defaulting to batch size 1." + ) + self._batch_size = 1 + + def _create_payload( + self, + messages: List[Dict], + generate=False, + gen_kwargs: dict = None, + seed=1234, + **kwargs, + ) -> dict: + gen_kwargs.pop("do_sample", False) + max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = gen_kwargs.pop("until", ["<|endoftext|>"]) + if not isinstance(stop, (list, tuple)): + stop = [stop] + return { + "messages": messages, + "model": self.model, + "max_tokens": max_tokens, + "temperature": temperature, + "stop": stop[:4], + "seed": seed, + **gen_kwargs, + } + + @staticmethod + def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + for choices in out["choices"]: + res.append(choices["message"]["content"]) + return res + + def tok_encode( + self, + string: Union[str, Any], + left_truncate_len=None, + add_special_tokens=None, + **kwargs, + ) -> Union[List[str], List[int], Any]: + return string + + def loglikelihood(self, requests, **kwargs): + raise NotImplementedError( + "Loglikelihood is not supported for chat completions. Consider using the completions API instead." 
+ ) + + +@register_model( + "openai-completions", +) +class OpenAICompletionsAPI(LocalCompletionsAPI): + def __init__( + self, + base_url="https://api.openai.com/v1/completions", + tokenizer_backend="tiktoken", + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + + @cached_property + def api_key(self): + """Override this property to return the API key for the API request.""" + key = os.environ.get("OPENAI_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the OPENAI_API_KEY environment variable." + ) + return key + + def loglikelihood(self, requests, **kwargs): + assert ( + self.model != "gpt-3.5-turbo" + ), "Loglikelihood is not supported for gpt-3.5-turbo" + return super().loglikelihood(requests, **kwargs) + + +@register_model("openai-chat-completions") +class OpenAIChatCompletion(LocalChatCompletion): + def __init__( + self, + base_url="https://api.openai.com/v1/chat/completions", + tokenizer_backend=None, + tokenized_requests=False, + **kwargs, + ): + super().__init__( + base_url=base_url, + tokenizer_backend=tokenizer_backend, + tokenized_requests=tokenized_requests, + **kwargs, + ) + + @cached_property + def api_key(self): + """Override this property to return the API key for the API request.""" + key = os.environ.get("OPENAI_API_KEY", None) + if key is None: + raise ValueError( + "API key not found. Please set the OPENAI_API_KEY environment variable." + ) + return key diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/optimum_lm.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/optimum_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..70d44abdaca859fa79bd1beed789c96ad2c22ca9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/optimum_lm.py @@ -0,0 +1,87 @@ +import json +from importlib.util import find_spec +from pathlib import Path + +from lm_eval import utils +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM + + +eval_logger = utils.eval_logger + + +@register_model("openvino") +class OptimumLM(HFLM): + """ + Optimum Intel provides a simple interface to optimize Transformer models and convert them to \ + OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \ + Intel® architectures using OpenVINO™ runtime. + + To use an OpenVINO config, use `--model_args ov_config` to point to a json file with an OpenVINO config: + `lm_eval --model openvino --model_args pretrained=gpt2,ov_config=config.json --task lambada_openai` + Example json file contents: {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"} + """ + + def __init__( + self, + device="cpu", + **kwargs, + ) -> None: + if "backend" in kwargs: + # optimum currently only supports causal models + assert ( + kwargs["backend"] == "causal" + ), "Currently, only OVModelForCausalLM is supported." + + self.openvino_device = device + + super().__init__( + device=self.openvino_device, + backend=kwargs.pop("backend", "causal"), + **kwargs, + ) + + def _create_model( + self, + pretrained: str, + revision="main", + dtype="auto", + trust_remote_code=False, + **kwargs, + ) -> None: + if not find_spec("optimum"): + raise Exception( + "package `optimum` is not installed. 
Please install it via `pip install optimum[openvino]`" + ) + else: + from optimum.intel.openvino import OVModelForCausalLM + + model_kwargs = kwargs if kwargs else {} + if "ov_config" in model_kwargs: + if not Path(model_kwargs["ov_config"]).exists(): + raise ValueError( + "ov_config should point to a .json file containing an OpenVINO config" + ) + with open(model_kwargs["ov_config"]) as f: + model_kwargs["ov_config"] = json.load(f) + eval_logger.info( + f"Using custom OpenVINO config: {model_kwargs['ov_config']}" + ) + + else: + model_kwargs["ov_config"] = {} + model_kwargs["ov_config"].setdefault("CACHE_DIR", "") + model_file = Path(pretrained) / "openvino_model.xml" + if model_file.exists(): + export = False + else: + export = True + + self._model = OVModelForCausalLM.from_pretrained( + pretrained, + revision=revision, + trust_remote_code=trust_remote_code, + export=export, + device=self.openvino_device.upper(), + **model_kwargs, + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/textsynth.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/textsynth.py new file mode 100644 index 0000000000000000000000000000000000000000..a14f6287b6f11b21cfc69ca471bcbe99a631be12 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/textsynth.py @@ -0,0 +1,172 @@ +"""TextSynth API +Implementation provided by Fabrice Bellard: + https://github.com/EleutherAI/lm-evaluation-harness/issues/295 + +In order to use the API, you must have a valid TextSynth account and +enough credits. + +Example usage: + + python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa + +Homepage: https://textsynth.com/index.html +""" + +import logging +import os + +import requests as _requests +from tqdm import tqdm + +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model +from lm_eval.models.utils import retry_on_specific_exceptions + + +logger = logging.getLogger(__name__) + + +def textsynth_completion(**kwargs): + """Query TextSynth API for completion. + Retry with back-off until they respond. + """ + + def _exception_callback(e: Exception, sleep_time: float) -> None: + import traceback + + traceback.print_exc() + + @retry_on_specific_exceptions( + on_exceptions=[_requests.exceptions.RequestException], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + return _requests.post(**kwargs) + + return completion() + + +@register_model("textsynth") +class TextSynthLM(LM): + def __init__(self, engine, truncate: bool = False, **kwargs) -> None: + """ + :param engine: str + TextSynth API engine (e.g. `gptj_6B`) + :param truncate: bool + Truncate input if too long (if False and input is too long, throw error) + """ + super().__init__() + + self.engine = engine + self.truncate = truncate + self.api_url = "https://api.textsynth.com" + # Read from environment variable TEXTSYNTH_API_SECRET_KEY + self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"] + + @property + def eot_token_id(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + @property + def max_length(self) -> int: + # NOTE: Turn on truncation to avoid errors on long inputs. 
+ return 2048 + + @property + def max_gen_toks(self) -> int: + return 256 + + @property + def batch_size(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + def tok_encode(self, string: str): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + def tok_decode(self, tokens): + # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until + raise NotImplementedError() + + def loglikelihood(self, requests, disable_tqdm: bool = False): + res = [] + for context, continuation in tqdm(requests, disable=disable_tqdm): + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/logprob", + headers={"Authorization": "Bearer " + self.api_key}, + json={"context": context, "continuation": continuation}, + ) + resp = response.json() + if "logprob" in resp: + logprob = resp["logprob"] + is_greedy = resp["is_greedy"] + res.append((logprob, is_greedy)) + + self.cache_hook.add_partial( + "loglikelihood", (context, continuation), (logprob, is_greedy) + ) + else: + logger.error( + f"The following response does not contain `logprobs`. Got:\n{resp}" + ) + assert False + return res + + def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): + # TODO: The TextSynth API does not support tokenized inputs so we cannot + # manually partition long contexts into smaller rolling windows as + # done for other models derived from `BaseLM`. Override this method + # with a windowing scheme that works for direct string inputs. + raise NotImplementedError( + "`loglikelihood_rolling` is currently not supported due to lack of " + "input tokenization support from TextSynth." + ) + + def generate_until(self, requests, disable_tqdm: bool = False): + if not requests: + return [] + + res = [] + for request in tqdm(requests, disable=disable_tqdm): + inp = request[0] + request_args = request[1] + until = request_args["until"] + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/completions", + headers={"Authorization": "Bearer " + self.api_key}, + json={ + "prompt": inp, + "max_tokens": self.max_gen_toks, + "top_k": 1, + "stop": until, + }, + ) + resp = response.json() + if "text" in resp: + s = resp["text"] + res.append(s) + + self.cache_hook.add_partial("generate_until", (inp, request_args), s) + else: + logger.error( + "The following response does not contain generated `text`. 
" + "Got:\n{resp}" + ) + assert False + return res + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override generate_until + raise NotImplementedError() diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/models/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8a81e5deca280f4e48b584a4eac78fb44d1feda2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/models/utils.py @@ -0,0 +1,666 @@ +import collections +import fnmatch +import gc +import itertools +import time +from functools import wraps +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + Union, +) + +import torch +import transformers + +from lm_eval.utils import eval_logger + + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + from transformers.configuration_utils import PretrainedConfig + + +def chunks(iter, n: int = 0, fn=None): + """ + Divides an iterable into chunks of specified size or based on a given function. + Useful for batching + + Parameters: + - iter: The input iterable to be divided into chunks. + - n: An integer representing the size of each chunk. Default is 0. + - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None. + + Returns: + An iterator that yields chunks of the input iterable. + + Example usage: + ``` + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + for chunk in chunks(data, 3): + print(chunk) + ``` + Output: + ``` + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [10] + ``` + """ + arr = [] + for i, x in enumerate(iter): + arr.append(x) + if len(arr) == (fn(i, iter) if fn else n): + yield arr + arr = [] + + if arr: + yield arr + + +class MultiChoice: + def __init__(self, choices) -> None: + self.choices = choices + + # Simple wildcard support (linux filename patterns) + def __contains__(self, values) -> bool: + for value in values.split(","): + if len(fnmatch.filter(self.choices, value)) == 0: + eval_logger.info("Available tasks to choose:") + for choice in self.choices: + eval_logger.info(f" - {choice}") + raise ValueError("'{}' is not in task list".format(value)) + return True + + def __iter__(self) -> Iterator: + for choice in self.choices: + yield choice + + +class Grouper: + """ + takes an array `arr` and function `fn` and returns a dictionary + with keys fn(ob) for each ob in `arr` and with values `self.arr[key]` a list of all + objects in `arr` satisfying `key == fn(ob)`. + """ + + def __init__(self, arr, fn) -> None: + # self.orig_arr = arr + self.size = len(arr) + arr = list(enumerate(arr)) + + def group_return_dict(arr, fn): + res = collections.defaultdict(list) + + for ob in arr: + res[fn(ob)].append(ob) + return res + + arr = group_return_dict(arr, lambda x: fn(x[1])) + + # self.arr has format Dict[Tuple[int, ]] + self.arr = arr + self._grouped = None + + def get_grouped(self): + # return the contents but not indices for our grouped dict. + if self._grouped: + return self._grouped + grouped = {} + for key in self.arr.keys(): + # drop the index from each element of self.arr + grouped[key] = [y[1] for y in self.arr[key]] + self._grouped = grouped + return grouped + + def get_original(self, grouped_dict): + # take in a grouped dictionary with e.g. 
results for each key listed + # in the same order as the instances in `self.arr`, and + # return the results in the same (single list) order as `self.orig_arr`. + res = [None] * self.size + cov = [False] * self.size + # orig = [None] * self.size + + assert grouped_dict.keys() == self.arr.keys() + + for key in grouped_dict.keys(): + for (ind, _), v in zip(self.arr[key], grouped_dict[key]): + res[ind] = v + cov[ind] = True + # orig[ind] = _ + + assert all(cov) + # assert orig == self.orig_arr + + return res + + +def pad_and_concat( + max_length: int, + tensors: List[torch.Tensor], + padding_side: Literal["right", "left"] = "right", +): + """ + Method for padding a list of tensors given the maximum tensor + length in the batch. Used for batching inputs and continuations in + seq2seq models. + """ + assert ( + padding_side == "left" or padding_side == "right" + ), f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'" + + for i, tensor in enumerate(tensors): + if len(tensor.shape) == 2: + tensor = tensor.squeeze(0) # squeeze, in case passed [1, seq] size + tensor_len = tensor.shape[0] + if tensor_len < max_length: + if padding_side == "right": + # right-pad + tensors[i] = torch.cat( + [ + tensor, # [seq] + torch.zeros( + max_length - tensor_len, + dtype=torch.long, + device=tensor.device, + ), # [padding_length - seq] + ], + dim=0, + ).unsqueeze(0) + else: + # left-pad + tensors[i] = torch.cat( + [ + torch.zeros( + max_length - tensor_len, + dtype=torch.long, + device=tensor.device, + ), # [padding_length - seq] + tensor, # [seq] + ], + dim=0, + ).unsqueeze(0) + else: + tensors[i] = tensor.unsqueeze(0) + + return torch.cat(tensors, dim=0) + + +def clear_torch_cache() -> None: + gc.collect() + torch.cuda.empty_cache() + + +def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype: + """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig""" + if isinstance(dtype, str) and dtype != "auto": + # Convert `str` args torch dtype: `float16` -> `torch.float16` + _torch_dtype = getattr(torch, dtype) + else: + _torch_dtype = dtype + return _torch_dtype + + +class MultiTokenEOSCriteria(transformers.StoppingCriteria): + """Criteria to stop on the specified multi-token sequence.""" + + def __init__( + self, + sequence: str, + tokenizer: transformers.PreTrainedTokenizer, + initial_decoder_input_length: int, + batch_size: int, + ) -> None: + self.initial_decoder_input_length = initial_decoder_input_length + self.done_tracker = [False] * batch_size + self.sequence = sequence + self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False) + # print(sequence, self.sequence_ids) + # we look back for 2 more tokens than it takes to encode our stop sequence + # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']` + # and we don't want to mistakenly not stop a generation because our + # (string) stop sequence was output in a different tokenization + + # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model, + # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized + # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described. 
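+        # Illustrative example: if `sequence` is "\n\n" it may encode to a single id,
+        # yet a model can emit two separate "\n" tokens instead; decoding the last
+        # len(self.sequence_ids) + 2 generated tokens back to text and doing a substring
+        # check (see __call__) still catches the stop string in that case.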
+ self.sequence_id_len = len(self.sequence_ids) + 2 + self.tokenizer = tokenizer + + def __call__(self, input_ids, scores, **kwargs) -> bool: + # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :] + + lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :] + + lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) + + for i, done in enumerate(self.done_tracker): + if not done: + self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + return False not in self.done_tracker + + +def stop_sequences_criteria( + tokenizer: transformers.PreTrainedTokenizer, + stop_sequences: List[str], + initial_decoder_input_length: int, + batch_size: int, +) -> transformers.StoppingCriteriaList: + return transformers.StoppingCriteriaList( + [ + *[ + MultiTokenEOSCriteria( + sequence, tokenizer, initial_decoder_input_length, batch_size + ) + for sequence in stop_sequences + ], + ] + ) + + +def undistribute(iterable): + """ + Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute . + + Re-interleaves results that have been split using more_itertools.distribute: + >>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6]) + >>> list(group_1) + [1, 3, 5] + >>> list(group_2) + [2, 4, 6] + >>> undistribute([group_1, group_2]) + [1, 2, 3, 4, 5, 6] + + Handles non-uniform component lengths: + + >>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7]) + >>> [list(c) for c in children] + [[1, 4, 7], [2, 5], [3, 6]] + >>> undistribute(children) + [1, 2, 3, 4, 5, 6, 7] + + Also handles when some iterables are empty: + + >>> children = distribute(5, [1, 2, 3]) + >>> [list(c) for c in children] + [[1], [2], [3], [], []] + >>> undistribute(children) + [1, 2, 3] + + """ + + return [ + x + for x in itertools.chain.from_iterable( + itertools.zip_longest(*[list(x) for x in iterable]) + ) + if x is not None + ] + + +def retry_on_specific_exceptions( + on_exceptions: List[Type[Exception]], + max_retries: Optional[int] = None, + backoff_time: float = 3.0, + backoff_multiplier: float = 1.5, + on_exception_callback: Optional[Callable[[Exception, float], Any]] = None, +): + """Retry on an LLM Provider's rate limit error with exponential backoff + For example, to use for OpenAI, do the following: + ``` + from openai import RateLimitError + + # Recommend specifying max_retries to avoid infinite loops! + @retry_on_specific_exceptions([RateLimitError], max_retries=3) + def completion(...): + # Wrap OpenAI completion function here + ... + ``` + """ + + def decorator(func: Callable): + @wraps(func) + def wrapper(*args, **kwargs): + sleep_time = backoff_time + attempt = 0 + while max_retries is None or attempt < max_retries: + try: + return func(*args, **kwargs) + except tuple(on_exceptions) as e: + if on_exception_callback is not None: + on_exception_callback(e, sleep_time) + time.sleep(sleep_time) + sleep_time *= backoff_multiplier + attempt += 1 + + return wrapper + + return decorator + + +class Collator: + """ + A class for reordering and batching elements of an array. + + This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data. + + Objects of this class have the group_by attribute which determines the method for grouping + the data while batching it. 
Three options include "gen_kwargs", "contexts", or None: + If group_by == "gen_kwargs" then requests will be grouped by gen_kwargs + If group_by == "contexts" then requests will be grouped by context + cont[:-1] + If None then requests will just be reordered by length descending. + """ + + def __init__( + self, + arr: List, + sort_fn: Callable = lambda x: x, + group_fn: Callable = lambda x: x[1], + group_by: Union[Literal["gen_kwargs", "contexts"], None] = None, + ) -> None: + self._group_by = group_by + # 0 indices are enumerated indices. Apply functions to original arr. + self._sort_fn = lambda x: sort_fn(x[1]) + self._group_fn = lambda x: group_fn(x[1]) + self._reorder_indices: List = [] + self._size = len(arr) + self._arr_with_indices: Union[Dict, Tuple[Tuple[int, Any], ...]] = tuple( + enumerate(arr) + ) # [indices, (arr)] + if self._group_by == "contexts": + self._group_by_context() + elif self._group_by == "gen_kwargs": + self._group_by_index() + + def _group_by_index(self) -> None: + """Group the elements of a list based on their indices.""" + self._arr_with_indices = self.group( + self._arr_with_indices, fn=self._group_fn, group_by="gen_kwargs" + ) + + def _group_by_context(self) -> None: + """Group the array with indices by context.""" + self._arr_with_indices = self.group( + self._arr_with_indices, fn=self._group_fn, group_by="contexts" + ) + + def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator: + """ + Generates and yields batches from the reordered array. The method of grouping and batching + depends on the parameter `group_by`. + If `group_by` is set to "gen_kwargs", it will batch the + re-ordered values with same gen_kwargs for each batch. + If `group_by` is "contexts", it caches the requests by context before batching. + If `group_by` is neither "gen_kwargs" nor "contexts", it yields the reordered array + + Parameters: + - n (int): The size of each batch. Defaults to 1. + - batch_fn ([Callable[[int, Iterable], int]] | None): A function to determine the size of + each batch. Optional, defaults to None. + + Returns: + Iterator: An iterator over batches of reordered elements grouped as per the `group_by` + attribute. + + Yields: + List of batched elements according to the `group_by` attribute. + """ + if self._group_by == "gen_kwargs": + for ( + key, + values, + ) in self._arr_with_indices.items(): # type: ignore + values = self._reorder(values) + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + elif self._group_by == "contexts": + # Get one sample from each key + values = self._reorder( + [value[0] for value in self._arr_with_indices.values()] + ) + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + else: + values = self._reorder(self._arr_with_indices) # type: ignore + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + + def get_cache( + self, + req_str: Tuple[str, str] = None, + cxt_toks: List[int] = None, + cont_toks: List[int] = None, + logits: torch.Tensor = None, + ) -> Iterator[Tuple[Tuple[str, str], List[int], torch.Tensor]]: + """ + Retrieves cached single-token continuations and their associated arguments, updating indices as necessary. + + The behavior of this function varies depending on how the `group_by` attribute is set: + + - When `group_by` is "contexts": + The function identifies single-token continuations by checking for keys that equate to + [context+continuation][-1] and logs the indices for re-ordering. 
+ In this mode, this function can work in two scenarios: + + 1. Cache Hit - Single Match: + If a single matching context-continuation pair is found in the cache, + the function yields the original arguments. + + 2. Cache Hit - Multiple Matches: + If multiple matching context-continuation pairs are found in the cache, + the function expands the logits batch dimension to match the number of cache hits. + It updates the original requests and continuation tokens. + + - When `group_by` is not set to "contexts": + This method yields the original arguments, logits and continuation tokens, + without checking for one-token continuations. + + Parameters: + - req_str (tuple[str, str]): Original strings used for CachingLM. + - cxt_toks (list[int]): Full context tokens used for lookup. + - cont_toks (list[int]): Continuation tokens for which logits were generated. + - logits (torch.Tensor [1, seq_length, vocab_size]): Logits generated by the model given context and continuation keys. + + Yields: + - Iterator: + - req_str (tuple[str, str]): strings used for CachingLM. + - cont_toks (list[int]) : continuation tokens. + - logits (torch.Tensor [1, seq_length, vocab_size]): The original logits (repeated cache hit times) + """ + if self._group_by == "contexts": + cache_hit: List[ + Tuple[int, Tuple[Tuple[str, str], List[int], List[int]]] + ] = self._arr_with_indices.pop(tuple(cxt_toks + cont_toks[:-1])) + if (cache_size := len(cache_hit)) == 1: + self._reorder_indices.extend(x[0] for x in cache_hit) + yield req_str, cont_toks, logits + else: + # If we have matching requests then expand the batch dimension (no-op) and + # yield each along with its corresponding args. + multilogits = logits.expand(cache_size, -1, -1).chunk(cache_size) + indices, req_str, cont_toks = zip( + *[(x[0], x[1][0], x[-1][-1]) for x in cache_hit] + ) + self._reorder_indices.extend(indices) + for c_key, cont_tok, logit in zip(req_str, cont_toks, multilogits): + yield c_key, cont_tok, logit + else: + yield req_str, cont_toks, logits + + def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> Iterator: + """ + Reorders the elements in the array based on the sorting function. + + Parameters: + - arr (list | tuple[tuple[int, Any], ...]]): The array or iterable to be reordered. + + Yields: + Iterator + """ + arr = sorted(arr, key=self._sort_fn) + if not self._group_by == "contexts": + # If grouped by contexts then indices will be set in get_cache() + self._reorder_indices.extend([x[0] for x in arr]) + yield from [x[1] for x in arr] + + def get_original(self, newarr: List) -> List: + """ + Restores the original order of elements from the reordered list. + + Parameters: + - newarr (list): The reordered array. + + Returns: + list: The array with elements restored to their original order. + """ + res = [None] * self._size + cov = [False] * self._size + + for ind, v in zip(self._reorder_indices, newarr): + res[ind] = v + cov[ind] = True + + assert all(cov) + + return res + + def __len__(self): + return self._size + + @staticmethod + def group( + arr: Iterable, + fn: Callable, + group_by: Literal["gen_kwargs", "contexts"] = "gen_kwargs", + ) -> dict: + """ + Groups elements of an iterable based on a provided function. + + + The `group_by` parameter determines the method of grouping. + If `group_by` is "contexts", the elements are grouped by [context + cont][:-1]. + If `group_by` is "gen_kwargs", the elements are grouped based on the gen_kwargs dict. + + Parameters: + - arr (Iterable): The iterable to be grouped. 
+ - fn (Callable): The function to determine the grouping. + - values (bool): If True, returns the values of the group. Defaults to False. + + Returns: + Iterator: An iterable of grouped elements. + """ + res = collections.defaultdict(list) + for ob in arr: + # where ob == [context + cont] + if group_by == "contexts": + res[tuple(fn(ob))].append(ob) + else: + try: + hashable_dict = tuple( + ( + key, + tuple(value) + if isinstance(value, collections.abc.Iterable) + else value, + ) + for key, value in sorted(fn(ob).items()) + ) + res[hashable_dict].append(ob) + except (TypeError, AttributeError): + res[tuple(fn(ob))].append(ob) + return res + + @staticmethod + def get_chunks(_iter, n: int = 0, fn=None): + """ + Divides an iterable into chunks of specified size or based on a given function. + Useful for batching + + Parameters: + - iter: The input iterable to be divided into chunks. + - n: An integer representing the size of each chunk. Default is 0. + - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None. + + Returns: + An iterator that yields chunks of the input iterable. + + Example usage: + ``` + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + for chunk in chunks(data, 3): + print(chunk) + ``` + Output: + ``` + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [10] + ``` + """ + arr = [] + _iter = tuple(_iter) + for i, x in enumerate(_iter): + arr.append(x) + if len(arr) == (fn(i, _iter) if fn else n): + yield arr + arr = [] + + if arr: + yield arr + + +def configure_pad_token( + tokenizer: "PreTrainedTokenizerBase", + model_config: Optional["PretrainedConfig"] = None, +) -> "PreTrainedTokenizerBase": + """ + This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present. + Some tokenizers require special handling. + + Args: + tokenizer: The tokenizer for which the padding token is to be handled. + model_config: The configuration of the model. Default is None. + + Returns: + The tokenizer after the padding token has been handled. + + Raises: + AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0. 
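+
+    Example (illustrative; GPT-2's tokenizer ships without a pad token):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer = configure_pad_token(tokenizer)
+        # pad_token_id is now set, falling back to an existing special token id here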
+ """ + if tokenizer.pad_token: + pass + elif tokenizer.unk_token: + tokenizer.pad_token_id = tokenizer.unk_token_id + elif tokenizer.eos_token: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + # handle special cases + if model_config and getattr(model_config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + tokenizer.pad_token = "<|endoftext|>" + elif ( + tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert tokenizer.pad_token_id == 0 + else: + tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + return tokenizer diff --git a/scripts/yans/lm-evaluation-harness/scripts/__init__.py b/scripts/yans/lm-evaluation-harness/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/lm-evaluation-harness/scripts/build_benchmark.py b/scripts/yans/lm-evaluation-harness/scripts/build_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..fc99b5ec37c6979bf55f6a1ac0ea6808fd0e539f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/build_benchmark.py @@ -0,0 +1,61 @@ +import argparse +import os + +import yaml +from promptsource.templates import DatasetTemplates +from tqdm import tqdm + +# from lm_eval.api.registry import ALL_TASKS +from lm_eval.logger import eval_logger + + +# from lm_eval.tasks import include_task_folder + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--benchmark_name", required=True) + parser.add_argument("--benchmark_path", required=True) + parser.add_argument("--task_save_path", default="lm_eval/tasks/") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + with open(args.benchmark_path, encoding="utf-8") as file: + TASK_LIST = yaml.full_load(file) + for task in tqdm(TASK_LIST): + eval_logger.info(f"Processing {task}") + + dataset_name = task["dataset_path"] + if "dataset_name" in task: + subset_name = task["dataset_name"] + file_subdir = f"{dataset_name}/{subset_name}" + else: + subset_name = None + file_subdir = f"{dataset_name}" + + file_path = os.path.join(args.task_save_path, file_subdir, "promptsource/") + + os.makedirs(file_path, exist_ok=True) + + if subset_name is None: + prompts = DatasetTemplates(dataset_name=dataset_name) + else: + prompts = DatasetTemplates( + dataset_name=dataset_name, subset_name=subset_name + ) + + for idx, prompt_name in enumerate(prompts.all_template_names): + full_file_name = f"promptsource_{idx}.yaml" + config_dict = { + "group": args.benchmark_name, + "include": "promptsource_template.yaml", + "use_prompts": f"promptsource:{prompt_name}", + } + + file_save_path = os.path.join(file_path, full_file_name) + eval_logger.info(f"Save to {file_save_path}") + with open(file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump(config_dict, yaml_file) diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/README.md b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/README.md new file mode 
100644 index 0000000000000000000000000000000000000000..7985adecaab926b39e5bfd5b96b093f73450e660 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/README.md @@ -0,0 +1,36 @@ +janitor.py contains a script to remove benchmark data contamination from training data sets. +It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165). + +## Algorithm +1) Collects all contamination text files that are to be removed from training data +2) Filters training data by finding `N`gram matches between the training data + and any contamination + 1) `N`grams ignore case and punctuation and are split on whitespace. + 2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around + the match, splitting the training data into chunks + 3) Any chunks less than `minimum_slice_length` are removed + 4) Training data sets split into more than `too_dirty_cutoff` are considered + completely contaminated and removed + +OpenAI used: +``` +ngram_n = 13 +window_to_remove = 200 +minimum_slice_length = 200 +too_dirty_cutoff = 10 +``` + +## Compiling + +Janitor can be used as a pure python program, but it is much faster if the ngram +code is run in C++. To compile the C++ code, run + +``` +pip install pybind11 +c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) +``` + +MacOS users: If your compiler isn't linked to Python, you may need to add to the above `-undefined dynamic_lookup`. \ +Linux users: If your compiler isn't linked to Python, you may need to follow these steps: +1. Rename the compiled code file to `janitor_util.so`. +2. Before running `import Janitor` in your code, add `sys.path.append("your/relative/path/to/janitor_util.so")` so that Python knows the location of `janitor_util.so`. 
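+
+## Usage sketch
+
+A minimal, illustrative sketch of the Python entry points that `generate_13_grams.py` in this
+directory builds on; the sample text and exact arguments are assumptions, not part of the pipeline:
+
+```
+from lm_eval.decontamination.janitor import Janitor, word_ngrams
+
+janitor = Janitor()
+document = "a stand-in training document with enough words to form at least one thirteen gram here"
+# normalize_string is the normalization step (ngrams ignore case and punctuation, as described above)
+ngrams = list(word_ngrams(janitor.normalize_string(document), 13))
+```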
diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/__init__.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py new file mode 100644 index 0000000000000000000000000000000000000000..d4af5ba5f3d5e16a485984ced2324951e56ad829 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py @@ -0,0 +1,73 @@ +import argparse +import glob +import logging +import os +import shutil +import subprocess + +from tqdm import tqdm +from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm + + +logger = logging.getLogger(__name__) + + +def process_task( + working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm +): + command = f"zstd {bucket_file_path}" + logger.info(command) + subprocess.call(command, shell=True) + + compressed_file = bucket_file_path + ".zst" + if output_directory: + shutil.move(compressed_file, output_directory) + + os.remove(bucket_file_path) + global_tqdm.update() + + +def compress_and_move(working_directory, output_directory, process_count): + os.makedirs(output_directory, exist_ok=True) + original_info_file_path = os.path.join(working_directory, "info.json") + assert os.path.exists(original_info_file_path) + + tasks = [] + bucket_file_paths = glob.glob( + os.path.join(working_directory, "output", "*.bkt.txt.sorted") + ) + for bucket_file_path in bucket_file_paths: + task = (process_task, (working_directory, output_directory, bucket_file_path)) + tasks.append(task) + + pool = TqdmMultiProcessPool(process_count) + + def on_done(_): + return None + + def on_error(_): + return None + + global_progress = tqdm( + total=len(bucket_file_paths), dynamic_ncols=True, unit="file" + ) + _ = pool.map(global_progress, tasks, on_error, on_done) + + shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json")) + + +parser = argparse.ArgumentParser(description="sort 13gram buckets") +parser.add_argument("-dir", "--working_directory", required=True) +parser.add_argument("-output", "--output_directory", required=True) +parser.add_argument("-procs", "--process_count", type=int, default=8) + +if __name__ == "__main__": + version = 1.00 + print(f"Running version {version}") + + logfile_path = "compress_and_package.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + compress_and_move(args.working_directory, args.output_directory, args.process_count) diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py new file mode 100644 index 0000000000000000000000000000000000000000..e508f266e9bfbe1cdf6f93de478c2d60d490d557 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py @@ -0,0 +1,215 @@ +""" +Outputs all 13-grams found in The Pile. + +Loops through all documents and uses the logic found in janitor.py to extract 13-grams. +We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the +next stage. 
We also include the current pile document_id with each ngram instance to allow the +filtering to exclude 13-grams that match more then 10 unique documents (done further down the pipeline). + +We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes +resuming hard (and we had the storage). + +Arguments +--------- +--working_directory (-dir) + Directory containing the pile distribution. An "output" subdirectory will be created underneath + to store the bucketed 13-grams, checkpoint and done files. Default: current directory +--n_value (-n) + n value in n-gram, added for later use if ever needed. Default: 13 +--bucket_count (-buckets) + Number of file buckets to use when generating 13grams. Default: 500 +""" + +import argparse +import glob +import json +import logging +import os +import pickle +import signal +import sys +from pathlib import Path +from signal import SIGINT + +from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm + +from lm_eval.decontamination.archiver import Reader, TextArchive +from lm_eval.decontamination.janitor import Janitor, word_ngrams + + +logger = logging.getLogger(__name__) + +terminate = False + + +def handler(signal_received, frame): + global terminate + terminate = True + + +def yield_pile(start_offsets=None, checkpoint_offset=None): + directory = "pile" + + if not os.path.exists(directory): + print( + "We expect the pile archives to be in the 'pile' directory, but this was not found." + ) + raise Exception("Pile directory not found.") + + files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) + + pile_global_offset = 0 + start_file = 0 + if checkpoint_offset: + for file_i, start_offset in enumerate(start_offsets): + if start_offset > checkpoint_offset: + break + + start_file = file_i + pile_global_offset = start_offset + + for file_i, file in enumerate(files): + if file_i < start_file: + logger.info(f"Skipping file {file}") + continue + logger.info(f"Reading from pile file: {file}") + reader = Reader() + for document in reader.read(file): + yield (pile_global_offset, document) + pile_global_offset += 1 + + +# Hash buckets > disk backed files. Supports file position checkpointing and resuming +# Allows you to write continuously and checkpoint intermittently. If a failure occurs +# the buckets are simply truncated at your last checkpoint. 
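+#
+# Illustrative use (mirroring do_ngrams_in_buckets below):
+#   buckets = Buckets(output_directory, bucket_count)
+#   buckets.add_data(ngram, f"{ngram} {offset}")  # routed to a bucket via hash(ngram)
+#   buckets.save_checkpoint()                     # flush and record bucket file offsets
+#   buckets.close_buckets()                       # commit all bucket files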
+class Buckets: + def __init__(self, directory, num_buckets): + self.bucket_files = [ + os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets) + ] + self.buckets = list(map(TextArchive, self.bucket_files)) + self.checkpoint_file = os.path.join(directory, "bucket_offsets.ckpt") + + if os.path.exists(self.checkpoint_file): + self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb")) + else: + self.bucket_offsets = [0 for i in range(len(self.buckets))] + + for i, offset in enumerate(self.bucket_offsets): + bucket = self.buckets[i] + bucket.fh.seek(offset) + bucket.fh.truncate() + + def add_data(self, key, value): + i = hash(key) % len(self.buckets) + bucket = self.buckets[i] + bucket.add_data(value) + + def save_checkpoint(self): + for bucket in self.buckets: + bucket.fh.flush() + + bucket_offsets = [bucket.fh.tell() for bucket in self.buckets] + pickle.dump(bucket_offsets, open(self.checkpoint_file, "wb")) + + def close_buckets(self): + for bucket in self.buckets: + bucket.commit() + + +def do_ngrams_in_buckets(n_value, working_directory, bucket_count): + pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8")) + pile_document_count = pile_statistics["Document Count"] + start_offsets = pile_statistics["File Start Offsets"] + + output_directory = os.path.join(working_directory, "output") + os.makedirs(output_directory, exist_ok=True) + + logger.info(f"Generating {n_value}-grams and bucketing.") + + # Done file + done_file = os.path.join(output_directory, "ngram_buckets.done") + if os.path.exists(done_file): + logger.info("ngrams already generated and bucketed, skipping") + return + + # Checkpoint + checkpoint_file = os.path.join(working_directory, "pile_offset.ckpt") + if os.path.exists(checkpoint_file): + checkpoint_offset = pickle.load(open(checkpoint_file, "rb")) + iterate = True + else: + checkpoint_offset = 0 + iterate = False + + logger.info(f"Starting at pile document index {checkpoint_offset}") + buckets = Buckets(output_directory, bucket_count) + + janitor = Janitor() + batch_size = 1000 + batch_counter = 0 + + with tqdm(total=checkpoint_offset, dynamic_ncols=True, unit="docs") as progress: + for offset, document in yield_pile(start_offsets, checkpoint_offset): + if iterate: + logger.info(f"Iterating to offset {checkpoint_offset} from {offset}") + progress.update(offset) + iterate = False + + if offset < checkpoint_offset: + progress.update() + + if terminate: + return + continue + + if offset == checkpoint_offset: + progress.reset(total=pile_document_count) + progress.update(checkpoint_offset) + + # Save checkpoint every "batch_size", only allow terminate after checkpoint + if batch_counter == batch_size: + progress.update(batch_size) + batch_counter = 0 + buckets.save_checkpoint() + pickle.dump(offset, open(checkpoint_file, "wb")) + if terminate: + buckets.close_buckets() + return + + ngrams = word_ngrams(janitor.normalize_string(document), n_value) + for ngram in ngrams: + buckets.add_data(ngram, f"{ngram} {offset}") + + batch_counter += 1 + + buckets.close_buckets() + Path(done_file).touch() + + +parser = argparse.ArgumentParser(description="Generate 13 grams from Pile.") +parser.add_argument("-dir", "--working_directory", default="") +parser.add_argument("-n", "--n_value", type=int, default=13) +parser.add_argument("-buckets", "--bucket_count", type=int, default=500) + +if __name__ == "__main__": + version = 1.00 + print(f"Running version {version}") + + if "PYTHONHASHSEED" not in os.environ or os.environ["PYTHONHASHSEED"] != 
"0": + print("Please run 'export PYTHONHASHSEED=0' before running generate.") + sys.exit() + + # Handle sigint (ctrl-c) cleanly + previous_signal_int = signal.signal(SIGINT, handler) + + logfile_path = "ngrams.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + do_ngrams_in_buckets(args.n_value, args.working_directory, args.bucket_count) + + info_dict = {"title": "dataset ngrams", "ngram_size": 13} + info_dict_path = os.path.join(args.working_directory, "info.json") + json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8")) diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py new file mode 100644 index 0000000000000000000000000000000000000000..681b591ced535dbb884fb65f58a0c9042c35b0ac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py @@ -0,0 +1,95 @@ +import glob +import json +import os +from functools import reduce + +import tqdm +from tqdm_multiprocess import TqdmMultiProcessPool + +from lm_eval.decontamination.archiver import Reader + + +def get_file_stats(file_path, tqdm_func, global_tqdm): + reader = Reader() + total_documents = 0 + total_size = 0 + update_frequency = 10000 + current_file_position = 0 + + with tqdm_func( + total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1 + ) as progress: + for document in reader.read(file_path, get_meta=True): + total_size += len(document) + total_documents += 1 + + if total_documents % update_frequency == 0: + new_file_pos = reader.fh.tell() + bytes_read = new_file_pos - current_file_position + current_file_position = new_file_pos + progress.update(bytes_read) + global_tqdm.update(bytes_read) + + return (total_documents, total_size) + + +def get_files(): + directory = "pile" + files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) + print(files) + return files + + +def get_stats(): + files = get_files() + total_size_bytes = sum(map(lambda x: os.path.getsize(x), files)) + + pool = TqdmMultiProcessPool(4) + global_tqdm = tqdm.tqdm( + total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1 + ) + + # Generate minhashes with pool + tasks = [(get_file_stats, (file,)) for file in files] + + def on_done(_): + return None + + def on_error(_): + return None + + results = pool.map(global_tqdm, tasks, on_error, on_done) + + total_documents, total_size = reduce( + lambda x, y: (x[0] + y[0], x[1] + y[1]), results + ) + + start_offsets = [] + current_offset = 0 + for file_document_count, _ in results: + start_offsets.append(current_offset) + current_offset += file_document_count + + return (total_documents, total_size, start_offsets) + + +if __name__ == "__main__": + version = 1.01 + print(f"Running version {version}") + + stats_file_path = "pile_statistics.json" + if os.path.exists(stats_file_path): + stats = json.load(open(stats_file_path, "r", encoding="utf-8")) + else: + document_count, total_document_size_chars, start_offsets = get_stats() + stats = { + "Data": "Pile statistics", + "Document Count": document_count, + "Total Pile Characters": total_document_size_chars, + "File Start Offsets": start_offsets, + } + json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4) + + print(f"document_count: {stats['Document Count']}") + print(f"total_chars: {stats['Total Pile Characters']}") + print(f"start_offsets: {stats['File Start Offsets']}") diff --git 
a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..858a8b20492507a6228a640cef0cc3ec7ac56bca
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp
@@ -0,0 +1,208 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <cctype>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+bool is_whitespace(char ch) noexcept {
+  // " \t\n\r\x0b\x0c" (python string.whitespace)
+  return ch == 32 or (9 <= ch and ch <= 13);
+  // return ch <= 32; // arguably too general, but slightly faster
+}
+
+bool is_punctuation(char c) noexcept {
+  // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64,
+  // 91-96, 123-126
+  return (33 <= c and c <= 47) or (58 <= c and c <= 64) or
+         (91 <= c and c <= 96) or (123 <= c and c <= 126);
+}
+
+// Takes a string and makes ngrams of length N, splitting grams on whitespace
+// and ignoring ignored characters. Returns a LARGE array of ngrams
+std::vector<std::string> clean_ngram(std::string const &input,
+                                     std::string const &ignore,
+                                     size_t ngram_n) noexcept {
+
+  size_t num_grams = 0;
+  std::vector<std::string> ngram_list;
+  std::vector<size_t> gram_lengths;
+  std::string current_ngram;
+
+  // Max gram length is set to 10 below.
+  current_ngram.reserve(11 * ngram_n);
+  gram_lengths.reserve(ngram_n);
+
+  bool started_gram = false;
+  gram_lengths.push_back(0);
+
+  // for (size_t i = 0; i < input.length(); i++) {
+  for (auto iter = input.begin(); iter != input.end(); iter++) {
+
+    // If whitespace, end the current ngram and start the next
+    if (is_whitespace(*iter) or gram_lengths.back() > 10) {
+
+      // Skip all whitespace
+      while (++iter != input.end() && is_whitespace(*iter))
+        ;
+      iter--;
+
+      if (started_gram) {
+        num_grams += 1;
+
+        // Building 1grams is a special case
+        if (ngram_n == 1) {
+          ngram_list.push_back(current_ngram);
+          current_ngram = current_ngram.substr(gram_lengths.front());
+          gram_lengths.back() = 0;
+
+          // If there are enough grams to form an ngram, save
+        } else if (num_grams >= ngram_n) {
+          // Save the current ngram
+          ngram_list.push_back(current_ngram);
+
+          // Start the next ngram by dropping the first gram and its space from
+          // the ngram
+          current_ngram = current_ngram.substr(gram_lengths.front() + 1);
+          current_ngram += ' ';
+
+          // Drop the length of the first gram and prepare to record the length
+          // of the new gram
+          gram_lengths.erase(gram_lengths.begin());
+          gram_lengths.push_back(0);
+
+          // Otherwise, continue building
+        } else {
+          current_ngram += ' ';
+          gram_lengths.push_back(0);
+        }
+
+        started_gram = false;
+      }
+
+      // Skip ignored characters
+      // alternatively, (perhaps marginally) faster: if (is_punctuation(ch))
+      // continue;
+    } else if (ignore.find(*iter) != std::string::npos) {
+      continue;
+    }
+
+    // If it is a non-ignored character, add it to the ngram and update the last
+    // gram's length
+    else {
+      current_ngram += tolower(*iter);
+      gram_lengths.back() += 1;
+      started_gram = true;
+    }
+  }
+
+  return ngram_list;
+}
+
+// Takes a string and makes ngrams of length N, splitting grams on whitespace
+// and ignoring ignored characters. Returns a LARGE array of tuples of (ngram,
+// start_idx, end_idx)
+std::vector<std::tuple<std::string, size_t, size_t>>
+clean_ngram_with_indices(std::string const &input, std::string const &ignore,
+                         size_t ngram_n) noexcept {
+
+  size_t num_grams = 0;
+  std::vector<std::tuple<std::string, size_t, size_t>> ngram_list;
+  std::vector<size_t> gram_lengths;
+  std::vector<size_t> gram_start_indices;
+  std::string current_ngram;
+
+  // Max gram length is set to 10 below.
+ current_ngram.reserve(11 * ngram_n); + + bool started_gram = false; + gram_lengths.push_back(0); + gram_start_indices.push_back(0); + + for (size_t i = 0; i < input.length(); i++) { + char ch = input[i]; + + // If whitespace, end the current ngram and start the next + if (is_whitespace(ch) || gram_lengths.back() > 10) { + + // Skip all whitespace + while (++i < input.length() && is_whitespace(input[i])) + ; + i--; + + if (started_gram) { + num_grams += 1; + + // Building 1grams is a special case + if (ngram_n == 1) { + ngram_list.push_back( + std::make_tuple(current_ngram, gram_start_indices.front(), i)); + current_ngram = current_ngram.substr(gram_lengths.front()); + gram_lengths.back() = 0; + gram_start_indices.back() = i + 1; + + // If there are enough grams to form an ngram, save + } else if (num_grams >= ngram_n) { + + // Save the current ngram + ngram_list.push_back( + std::make_tuple(current_ngram, gram_start_indices.front(), i)); + + // Start the next ngram by dropping the first gram and its space from + // the ngram + current_ngram = current_ngram.substr(gram_lengths.front() + 1); + current_ngram += ' '; + + // Drop the length of the first gram and prepare to record the length + // of the new gram + gram_lengths.erase(gram_lengths.begin()); + gram_lengths.push_back(0); + + gram_start_indices.erase(gram_start_indices.begin()); + gram_start_indices.push_back(i + 1); + + // Otherwise, continue building + } else { + current_ngram += ' '; + gram_lengths.push_back(0); + gram_start_indices.push_back(i + 1); + } + + started_gram = false; + } + + // Skip ignored characters + } else if (ignore.find(ch) != std::string::npos) { + continue; + + // If it is a non-ignored character, add it to the ngram and update the + // last gram's length + } else { + current_ngram += tolower(ch); + gram_lengths.back() += 1; + started_gram = true; + } + } + + return ngram_list; +} + +PYBIND11_MODULE(janitor_util, m) { + m.doc() = "pybind11 example plugin"; // optional module docstring + // m.def("add", &add, "A function which adds two numbers"); // example + // function + m.def("clean_ngram", &clean_ngram, + "Create ngrams of words, ignoring some characters"); + m.def("clean_ngram_with_indices", &clean_ngram_with_indices, + "Create ngrams of words with indices, ignoring some characters"); +} + +// Example compile +// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) +// janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) If +// python and gcc aren't linked, append to the above: -undefined +// dynamic_lookup diff --git a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py new file mode 100644 index 0000000000000000000000000000000000000000..9d345d8e86f409495b95a73f4539b2f4df57af70 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py @@ -0,0 +1,129 @@ +""" +Processes each sorted bucket, creating a new file listing all ngrams that matched more then 10 +unique documents with their unique document counts. Uses multiprocessing and very little memory +as we stream from presorted buckets. Will use a lot of disk though. + +Arguments +--------- +--working_directory (-dir) + Directory containing the sorted buckets, processed files will be deposited here. Default: current directory +--move_dir (-move) + Directory to move processed 13grams too. 
Default: Do nothing +--process_count (-procs) + Number of processes to use. Default: 4 +""" + +import argparse +import glob +import logging +import os +import re +import shutil +from pathlib import Path + +from tqdm import tqdm +from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm + +from scripts.clean_training_data.archiver import TextArchive, TextReader + + +logger = logging.getLogger(__name__) + + +# Multiprocessed +def process_bucket( + bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm +): + bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 + done_file = os.path.join( + processed_directory, f"ngram_bucket_processing_{bucket_id}.done" + ) + if os.path.exists(done_file): + logger.info(f"bucket {bucket_id} already processed, skipping") + return + + # For managing tqdm + file_size = os.path.getsize(bucket_file_path) + bucket_progress = tqdm_func( + total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1 + ) + current_file_position = 0 + update_frequency = 100 * 1000000 # 100mb + update_counter = 0 + + # Iterate through and output ngrams which occur in more then 10 documents + bucket = TextReader(bucket_file_path) + + output_file_path = bucket_file_path + ".processed" + output_archive = TextArchive(output_file_path, mode="wb") + + current_ngram = "" + current_ngram_document_ids = set() + for line in bucket.read(): + [ngram, document_id] = line.rsplit(" ", 1) + + # Write ngram if more then 10 unique document occurrences + if ngram != current_ngram: + if len(current_ngram_document_ids) > 10: + output_archive.add_data( + f"{current_ngram} {len(current_ngram_document_ids)}" + ) + current_ngram = ngram + current_ngram_document_ids = set() + + current_ngram_document_ids.add(document_id) + + # Update tqdm + update_counter += bucket.fh.tell() - current_file_position + current_file_position = bucket.fh.tell() + if update_counter > update_frequency: + bucket_progress.update(update_counter) + update_counter = 0 + + # Remainder + if len(current_ngram_document_ids) > 10: + output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}") + + output_archive.commit() + Path(done_file).touch() + + if move_dir: + shutil.move(output_file_path, move_dir) + + global_tqdm.update() + + +def process_sorted_buckets(working_directory, move_dir, process_count): + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted")) + processed_directory = os.path.join(working_directory, "processed") + os.makedirs(processed_directory, exist_ok=True) + + pool = TqdmMultiProcessPool(process_count) + tasks = [ + (process_bucket, (bucket_file, processed_directory, move_dir)) + for bucket_file in bucket_file_paths + ] + + global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket") + + def on_done(_): + return None + + def on_error(_): + return None + + _ = pool.map(global_tqdm, tasks, on_error, on_done) + + +parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.") +parser.add_argument("-dir", "--working_directory", default="") +parser.add_argument("-move", "--move_dir", default="") +parser.add_argument("-procs", "--process_count", type=int, default=4) + +if __name__ == "__main__": + logfile_path = "process13grams.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + process_sorted_buckets(args.working_directory, args.move_dir, args.process_count) diff --git 
a/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py new file mode 100644 index 0000000000000000000000000000000000000000..83990de822e333bcd16c8d8092aec7ce41ff4e94 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py @@ -0,0 +1,62 @@ +""" +Iteratively runs gnu sort on each bucket, uses up to 8 cores. + +Arguments +--------- +--working_directory (-dir) + Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same + directory and the unsorted buckets are removed after. +""" + +import argparse +import glob +import logging +import os +import signal +import subprocess +from signal import SIGINT + +from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm + + +logger = logging.getLogger(__name__) + +terminate = False + + +def handler(signal_received, frame): + global terminate + terminate = True + + +def sort_13_gram_buckets(working_directory): + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt")) + + for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): + sorted_file_path = bucket_file_path + ".sorted" + command = f"sort {bucket_file_path} > {sorted_file_path}" + logger.info(command) + subprocess.call(command, shell=True) + + if terminate: + return + + os.remove(bucket_file_path) + + +parser = argparse.ArgumentParser(description="sort 13gram buckets") +parser.add_argument("-dir", "--working_directory", default="") + +if __name__ == "__main__": + version = 1.00 + print(f"Running version {version}") + + # Handle sigint (ctrl-c) cleanly + previous_signal_int = signal.signal(SIGINT, handler) + + logfile_path = "sort13grambuckets.log" + setup_logger_tqdm(logfile_path) + + args = parser.parse_args() + sort_13_gram_buckets(args.working_directory) diff --git a/scripts/yans/lm-evaluation-harness/scripts/cost_estimate.py b/scripts/yans/lm-evaluation-harness/scripts/cost_estimate.py new file mode 100644 index 0000000000000000000000000000000000000000..baf81147547b0a7a92e52904c70cb11d246f680b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/cost_estimate.py @@ -0,0 +1,99 @@ +import random + +import transformers + +from lm_eval import evaluator, tasks +from lm_eval.api.model import LM + + +class DryrunLM(LM): + def __init__(self): + self.tokencost = 0 + self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") + self.tokenizer.pad_token = "<|endoftext|>" + + @classmethod + def create_from_arg_string(cls, arg_string): + return cls() + + def loglikelihood(self, requests): + res = [] + + for ctx, cont in requests: + res.append((-random.random(), False)) + self.tokencost += len(self.tokenizer.tokenize(ctx + cont)) + + return res + + def generate_until(self, requests): + res = [] + + for ctx, _ in requests: + res.append("lol") + + # assume worst case - generates until 256 + self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256 + + return res + + def loglikelihood_rolling(self, requests): + res = [] + + for (s,) in requests: + # assume worst case: extra full context + self.tokencost += len(self.tokenizer.tokenize(s)) + 2048 + + return res + + +def main(): + lm = DryrunLM() + + task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc" + values = [] + for 
taskname in task_list.split(","): + lm.tokencost = 0 + evaluator.simple_evaluate( + lm=lm, + task_dict={taskname: tasks.get_task(taskname)()}, + num_fewshot=0, + limit=None, + bootstrap_iters=10, + ) + + print(taskname, lm.tokencost) + values.append( + [ + taskname, + lm.tokencost, + lm.tokencost / 1000 * 0.0008, + lm.tokencost / 1000 * 0.0012, + lm.tokencost / 1000 * 0.006, + lm.tokencost / 1000 * 0.06, + ] + ) + from pytablewriter import MarkdownTableWriter + + writer = MarkdownTableWriter() + writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"] + + values.sort(key=lambda x: -x[1]) + totcost = sum([x[1] for x in values]) + values.append( + [ + "**Total**", + totcost, + totcost / 1000 * 0.0008, + totcost / 1000 * 0.0012, + totcost / 1000 * 0.006, + totcost / 1000 * 0.06, + ] + ) + + writer.value_matrix = values + + print(writer.dumps()) + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/scripts/get_prompts.py b/scripts/yans/lm-evaluation-harness/scripts/get_prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..d262ec37e40f229c2009f9f162cc58834291de12 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/get_prompts.py @@ -0,0 +1,25 @@ +from itertools import islice + +from lm_eval import tasks + + +ct = 3 + +for ( + tname, + Task, +) in tasks.TASK_REGISTRY.items(): # [('record', tasks.superglue.ReCoRD)]:# + task = Task() + + print("#", tname) + docs = islice( + task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct + ) + print() + for i in range(ct): + print() + doc = next(docs) + print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n") + print() + print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n") + print() diff --git a/scripts/yans/lm-evaluation-harness/scripts/make_gpt2_test_cases.py b/scripts/yans/lm-evaluation-harness/scripts/make_gpt2_test_cases.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1a4bffe03ef057c331dc9a20c0a5eadb46be66 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/make_gpt2_test_cases.py @@ -0,0 +1,48 @@ +import random + +import torch +import torch.nn.functional as F +import transformers + + +random.seed(42) + + +data = [ + "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", + "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology", + 'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]', + "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.", + "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]", + "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. 
By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ", + "Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.", + "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", + "Hello World", +] + + +model = transformers.GPT2LMHeadModel.from_pretrained("gpt2") +tok = transformers.GPT2Tokenizer.from_pretrained("gpt2") + +tgs = [] + +for dat in data: + random.seed(dat) + # print(model(tok.encode(dat, return_tensors="pt"))[0][0]) + + toks = tok.encode(dat, return_tensors="pt") + ind = random.randrange(len(toks[0]) - 1) + logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1] # [batch, seq, vocab] + + res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0] + + tgs.append(float(res[ind:].sum())) + print( + r'("""' + + tok.decode(toks[0, : ind + 1]) + + r'""", """' + + tok.decode(toks[0, ind + 1 :]) + + r'"""), ' + ) + +print(tgs) diff --git a/scripts/yans/lm-evaluation-harness/scripts/make_table_results.py b/scripts/yans/lm-evaluation-harness/scripts/make_table_results.py new file mode 100644 index 0000000000000000000000000000000000000000..59eddb4a4fdac05c1d2ce3623a7bd4312101bec2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/make_table_results.py @@ -0,0 +1,75 @@ +""" +Usage: + python make_table_tasks.py --output +""" + +import json +import logging +import os + +from pytablewriter import LatexTableWriter, MarkdownTableWriter + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def make_table(result_dict): + """Generate table of results.""" + md_writer = MarkdownTableWriter() + latex_writer = LatexTableWriter() + md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] + latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] + + values = [] + + for k, dic in sorted(result_dict["results"].items()): + version = result_dict["versions"][k] + percent = k == "squad2" + for m, v in dic.items(): + if m.endswith("_stderr"): + continue + + if m + "_stderr" in dic: + se = dic[m + "_stderr"] + if percent or m == "ppl": + values.append([k, version, m, "%.2f" % v, "±", "%.2f" % se]) + else: + values.append( + [k, version, m, "%.2f" % (v * 
100), "±", "%.2f" % (se * 100)] + ) + else: + if percent or m == "ppl": + values.append([k, version, m, "%.2f" % v, "", ""]) + else: + values.append([k, version, m, "%.2f" % (v * 100), "", ""]) + k = "" + version = "" + md_writer.value_matrix = values + latex_writer.value_matrix = values + + # todo: make latex table look good + # print(latex_writer.dumps()) + + return md_writer.dumps() + + +if __name__ == "__main__": + # loop dirs and subdirs in results dir + # for each dir, load json files + for dirpath, dirnames, filenames in os.walk("../results"): + # skip dirs without files + if not filenames: + continue + path_readme = os.path.join(dirpath, "README.md") + with open(path_readme, "w", encoding="utf-8") as f: + # get path name, only last folder + path_name = dirpath.split("/")[-1] + f.write(f"# {path_name} \n\n") + for filename in sorted([f for f in filenames if f.endswith(".json")]): + path = os.path.join(dirpath, filename) + with open(path, "r", encoding="utf-8") as f: + result_dict = json.load(f) + with open(path_readme, "a", encoding="utf-8") as f: + f.write(f"## {filename} \n") + f.write(f"{make_table(result_dict)} \n") diff --git a/scripts/yans/lm-evaluation-harness/scripts/make_table_tasks.py b/scripts/yans/lm-evaluation-harness/scripts/make_table_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..8a3b19634b11eb9974a32dcd2a80cab3f0940f9e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/make_table_tasks.py @@ -0,0 +1,55 @@ +""" +Usage: + python make_table_tasks.py --output +""" + +import argparse +import logging + +from pytablewriter import MarkdownTableWriter + +from lm_eval import tasks + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def check(tf): + if tf: + return "✓" + else: + return " " + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output", type=str, default="task_table.md") + args = parser.parse_args() + + writer = MarkdownTableWriter() + writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"] + values = [] + + tasks = tasks.TASK_REGISTRY.items() + tasks = sorted(tasks, key=lambda x: x[0]) + for tname, Task in tasks: + task = Task() + v = [ + tname, + check(task.has_training_docs()), + check(task.has_validation_docs()), + check(task.has_test_docs()), + len( + list( + task.test_docs() if task.has_test_docs() else task.validation_docs() + ) + ), + ", ".join(task.aggregation().keys()), + ] + logger.info(v) + values.append(v) + writer.value_matrix = values + table = writer.dumps() + with open(args.output, "w", encoding="utf-8") as f: + f.write(table) diff --git a/scripts/yans/lm-evaluation-harness/scripts/model_comparator.py b/scripts/yans/lm-evaluation-harness/scripts/model_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..55f4f3b15468b2f46e590cbfd82d7902f1d9a16f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/model_comparator.py @@ -0,0 +1,139 @@ +import argparse +import os +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd +import torch + +import lm_eval.evaluator +import lm_eval.models.utils +from lm_eval import tasks, utils + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +eval_logger = utils.eval_logger + + +def memory_stats(): + eval_logger.info( + f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}" + ) + + +def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, 
float]: + from scipy.stats.norm import sf + + acc1, acc2 = res1["acc,none"], res2["acc,none"] + st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"] + Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2)) + # Determining the p-value + p_value = 2 * sf(abs(Z)) # two-tailed test + return Z, p_value + + +def print_results( + data_to_print: List = None, results_dict: Dict = None, alpha: float = None +): + model1_data = data_to_print[0] + model2_data = data_to_print[1] + table_data = [] + for task in model1_data.keys(): + row = { + "Task": task, + "HF Accuracy": model1_data[task]["acc,none"], + "vLLM Accuracy": model2_data[task]["acc,none"], + "HF StdErr": model1_data[task]["acc_stderr,none"], + "vLLM StdErr": model2_data[task]["acc_stderr,none"], + } + table_data.append(row) + comparison_df = pd.DataFrame(table_data) + comparison_df["Z-Score"] = comparison_df["Task"].apply( + lambda task: results_dict[task]["z"] + ) + comparison_df["P-Value"] = comparison_df["Task"].apply( + lambda task: results_dict[task]["p_value"] + ) + comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply( + lambda p: "✓" if p > alpha else "×" + ) + return comparison_df + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained", default="EleutherAI/pythia-70m", help="name of model to compare" + ) + parser.add_argument( + "--hf_args", help="huggingface model args =", default="" + ) + parser.add_argument("--vllm_args", help="vllm model args =", default="") + parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag") + parser.add_argument( + "--limit", + type=float, + default=100, + ) + parser.add_argument( + "--alpha", + type=float, + default=0.05, + help="Significance level for two-tailed z-test", + ) + parser.add_argument( + "--device", + type=str, + default="cuda", + ) + parser.add_argument( + "--batch", + type=str, + default=8, + ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Logging verbosity", + ) + return parser.parse_args() + + +if __name__ == "__main__": + tasks.initialize_tasks() + args = parse_args() + tasks = args.tasks.split(",") + print(tasks) + hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args + results_vllm = lm_eval.evaluator.simple_evaluate( + model="vllm", + model_args=f"pretrained={args.pretrained}" + vllm_args, + tasks=tasks, + limit=args.limit, + device=args.device, + batch_size=args.batch, + ) + memory_stats() + lm_eval.models.utils.clear_torch_cache() + eval_logger.info("Memory stats cleared") + memory_stats() + results_hf = lm_eval.evaluator.simple_evaluate( + model="hf", + model_args=f"pretrained={args.pretrained}" + hf_args, + tasks=tasks, + limit=args.limit, + device=args.device, + batch_size=args.batch, + ) + all_res = {} + for task1, task2 in zip( + results_hf["results"].items(), results_vllm["results"].items() + ): + assert task1[0] == task2[0] + z, p_value = calculate_z_value(task1[1], task2[1]) + all_res[task1[0]] = {"z": z, "p_value": p_value} + df = print_results( + [results_hf["results"], results_vllm["results"]], all_res, args.alpha + ) + print(df) diff --git a/scripts/yans/lm-evaluation-harness/scripts/regression.py b/scripts/yans/lm-evaluation-harness/scripts/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..75258dcb640a4f32a0011e864d390e9619f6e2e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/regression.py @@ -0,0 +1,199 @@ +import argparse +import json +import os +import subprocess +import time +from pathlib 
import Path + +from lm_eval import utils +from lm_eval.api.registry import ALL_TASKS + + +seq2seq_models = ["google/flan-t5-small"] +causal_models = [ + "gpt2", + "facebook/opt-125m", + "EleutherAI/gpt-neo-125m", + "EleutherAI/pythia-160m", +] +model_names = seq2seq_models + causal_models + + +completion_tasks = ["boolq", "lambada_openai", "winogrande"] +choice_tasks = ["hellaswag", "openbookqa", "piqa"] +perplexity_tasks = ["wikitext"] +generation_tasks = [] +task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--branches", default=[]) + parser.add_argument("--models", default=model_names) + parser.add_argument("--tasks", default=task_names) + parser.add_argument("--acc_norm", type=bool, default=False) + parser.add_argument("--perplexity", default=None) + # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000 + parser.add_argument("--num_fewshot", type=int, default=0) + parser.add_argument("--limit", type=float, default=None) + # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this + parser.add_argument("--model", default="hf-causal") + # Use whatever is faster here + parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True") + parser.add_argument("--batch_size", default="auto") + return parser.parse_args() + + +def eval_models(args, branch=None): + if branch is not None: + if os.system(f"git checkout {branch}") != 0: + return {}, 0 + + branch = branch or initial_branch + + start_time = time.time() + + results = {} + + for model in args.models: + model_type = ( + "hf-causal" + if model in causal_models + else "hf-seq2seq" + if model in seq2seq_models + else args.model + ) + model_args = f"pretrained={model},{args.model_args}" + # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527 + tasks = ( + args.tasks + if model in causal_models or model_type == "hf-causal" + else list(filter(lambda task: task not in perplexity_tasks, args.tasks)) + ) + # TODO: OOM with auto for seq2seq models, also can OOM with llama + batch_size = ( + args.batch_size + if model in causal_models or model_type == "hf-causal" + else 64 + if args.batch_size == "auto" + else args.batch_size + ) + output_path = ( + f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json" + ) + + command = ( + f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " + f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " + f"--batch_size {batch_size} --no_cache --output_path {output_path}" + ) + + print( + f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}" + ) + + ret = os.system(command) + + results[model] = ( + json.load(open(output_path, encoding="utf-8")) + if ret == 0 + else {"results": {}} + ) + + end_time = time.time() + + return results, end_time - start_time + + +def extract_value(args, results, model, task, err=False): + if model not in results: + return 0 + results = results[model]["results"] + if task not in results: + return 0 + results = results[task] + if args.acc_norm and "acc_norm,none" in results: + return results["acc_norm,none"] if not err else results["acc_norm_stderr,none"] + if "acc,none" in results: + return results["acc,none"] if not err else results["acc_stderr,none"] + if (args.perplexity or "word_perplexity") + ",none" in results: + return ( + 
results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0 + ) + return 0 + + +def format_value(args, results, model, task): + val = 100 * extract_value(args, results, model, task) + err = 100 * extract_value(args, results, model, task, err=True) + return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}" + + +def format_diff(args, results1, results2, model, task): + val1 = 100 * extract_value(args, results1, model, task) + val2 = 100 * extract_value(args, results2, model, task) + diff = val2 - val1 + return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}" + + +def main(): + args = parse_args() + + args.branches = ( + args.branches.split(",") if isinstance(args.branches, str) else args.branches + ) + args.models = ( + args.models.split(",") if isinstance(args.models, str) else args.models + ) + args.tasks = ( + ALL_TASKS + if args.tasks == "all_tasks" + else utils.pattern_match(args.tasks.split(","), ALL_TASKS) + if isinstance(args.tasks, str) + else args.tasks + ) + + global initial_branch + initial_branch = ( + subprocess.check_output("git branch --show-current", shell=True) + .decode("ascii") + .strip() + ) + + # TODO: implement proper timing for each task + # TODO: reduce IO by sharing tasks between models? + + results, runtime = eval_models(args) + print(results, runtime) + + runs = [] + for branch in args.branches: + runs.append((branch, *eval_models(args, branch))) + + os.system(f"git checkout {initial_branch}") + + print("") + print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|") + print(f"|--|{'--|' * len(args.models)}") + for task in args.tasks: + print( + f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|" + ) + for branch, branch_results, branch_runtime in runs: + print( + f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|" + ) + print( + f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|" + ) + + print("") + print("|branch|runtime|%|") + print("|--|--|--|") + print(f"|{initial_branch}|{runtime:.1f}s|100%|") + for branch, _, branch_runtime in runs: + print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|") + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/scripts/requests_caching.py b/scripts/yans/lm-evaluation-harness/scripts/requests_caching.py new file mode 100644 index 0000000000000000000000000000000000000000..2aaf323485606c61b435fe0f3ab5a6c97b5561b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/requests_caching.py @@ -0,0 +1,92 @@ +""" +Usage: + python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests= +""" + +import argparse +import os +from typing import List + +import torch +from transformers import ( + pipeline as trans_pipeline, +) + +from lm_eval import simple_evaluate +from lm_eval.evaluator import request_caching_arg_to_dict +from lm_eval.utils import eval_logger + + +MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) + +# Used to specify alternate cache path, useful if run in a docker container +# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image +LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH") + + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +MODEL = "EleutherAI/pythia-70m" + +TASK = "text-generation" + + +def run_model_for_task_caching(tasks: List[str], 
cache_requests: str): + eval_logger.info(f"Loading HF model: {MODEL}") + + trans_pipe = trans_pipeline( + task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True + ) + + model = trans_pipe.model + tokenizer = trans_pipe.tokenizer + + eval_logger.info( + f"Running simple_evaluate to cache request objects for tasks: {tasks}" + ) + + cache_args = request_caching_arg_to_dict(cache_requests=cache_requests) + + eval_logger.info( + f"The following operations will be performed on the cache: {cache_requests}" + ) + + eval_data = simple_evaluate( + model="hf-auto", + model_args={ + "pretrained": model, + "tokenizer": tokenizer, + }, + limit=1, + device=DEVICE, + tasks=tasks, + write_out=True, + **cache_args, + ) + + return eval_data + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tasks", + "-t", + default=None, + metavar="task1,task2", + ) + parser.add_argument( + "--cache_requests", + type=str, + default=None, + choices=["true", "refresh", "delete"], + help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", + ) + + args = parser.parse_args() + + tasks = args.tasks.split(",") + + eval_data = run_model_for_task_caching( + tasks=tasks, model=MODEL, device=DEVICE, cache_requests=args.cache_requests + ) diff --git a/scripts/yans/lm-evaluation-harness/scripts/write_out.py b/scripts/yans/lm-evaluation-harness/scripts/write_out.py new file mode 100644 index 0000000000000000000000000000000000000000..6ff5a4304ed7798f8e375abeb8a5f30cb2aedcea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/write_out.py @@ -0,0 +1,97 @@ +import argparse +import os +import random + +import numpy as np + +from lm_eval import tasks +from lm_eval.tasks import TaskManager +from lm_eval.utils import eval_logger, join_iters + + +EXAMPLE_DIVIDER = "!!@@##@@!! 
-- Example {i}\n" + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_base_path", "--output_path", required=True) + parser.add_argument("--tasks", default="all_tasks") + parser.add_argument("--sets", type=str, default="val") # example: val,test + parser.add_argument("--num_fewshot", type=int, default=1) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num_examples", type=int, default=1) + parser.add_argument( + "--include_path", + type=str, + default=None, + help="Additional path to include if there are external tasks to include.", + ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Log error when tasks are not registered.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + np.random.seed(args.seed) + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + + task_manager = TaskManager(args.verbosity, include_path=args.include_path) + + if args.tasks == "all_tasks": + task_names = task_manager.all_tasks + else: + task_names = args.tasks.split(",") + task_dict = tasks.get_task_dict(task_names, task_manager) + + os.makedirs(args.output_base_path, exist_ok=True) + for task_name, task in task_dict.items(): + if isinstance(task, tuple): + _, task = task + rnd = random.Random() + rnd.seed(args.seed) + + iters = [] + + for set in args.sets.split(","): + docs = None + if set == "train" and task.has_training_docs(): + docs = task.training_docs() + if set == "val" and task.has_validation_docs(): + docs = task.validation_docs() + if set == "test" and task.has_test_docs(): + docs = task.test_docs() + if docs is not None: + iters.append(docs) + + if len(iters) == 0: + raise ValueError( + f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value." + ) + + docs = join_iters(iters) + + with open( + os.path.join(args.output_base_path, task_name), "w", encoding="utf8" + ) as f: + for i, doc in ( + zip(range(args.num_examples), docs) + if args.num_examples > 0 + else enumerate(docs) + ): + f.write(EXAMPLE_DIVIDER.format(i=i)) + ctx = task.fewshot_context( + doc=doc, + num_fewshot=args.num_fewshot, + ) + f.write(ctx + "\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/lm-evaluation-harness/scripts/zeno_visualize.py b/scripts/yans/lm-evaluation-harness/scripts/zeno_visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..f2772a235579b64cb05353a950a716e104a44cb2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/scripts/zeno_visualize.py @@ -0,0 +1,242 @@ +import argparse +import json +import os +import re +from pathlib import Path + +import pandas as pd +from zeno_client import ZenoClient, ZenoMetric + +from lm_eval.utils import ( + eval_logger, + get_latest_filename, + get_results_filenames, + get_sample_results_filenames, +) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk." + ) + parser.add_argument( + "--data_path", + required=True, + help="Where to find the results of the benchmarks that have been run. 
Uses the name of each subfolder as the model name.", + ) + parser.add_argument( + "--project_name", + required=True, + help="The name of the generated Zeno project.", + ) + return parser.parse_args() + + +def main(): + """Upload the results of your benchmark tasks to the Zeno AI evaluation platform. + + This scripts expects your results to live in a data folder where subfolders contain results of individual models. + """ + args = parse_args() + + client = ZenoClient(os.environ["ZENO_API_KEY"]) + + # Get all model subfolders from the parent data folder. + models = [ + os.path.basename(os.path.normpath(f)) + for f in os.scandir(Path(args.data_path)) + if f.is_dir() + ] + + assert len(models) > 0, "No model directories found in the data_path." + + # Get the tasks from the latest results file of the first model. + tasks = set(tasks_for_model(models[0], args.data_path)) + + # Get tasks names from the latest results file for each model + # Get intersection of tasks for all models + for model in models: + old_tasks = tasks.copy() + task_count = len(tasks) + model_tasks = set(tasks_for_model(model, args.data_path)) + tasks.intersection(set(model_tasks)) + + if task_count != len(tasks): + eval_logger.warning( + f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}" + ) + + assert ( + len(tasks) > 0 + ), "Must provide at least one task in common amongst models to compare." + + for task in tasks: + # Upload data for all models + for model_index, model in enumerate(models): + # Get latest results and sample results for a model + model_dir = Path(args.data_path, model) + model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()] + model_results_filenames = get_results_filenames(model_files) + model_sample_filenames = get_sample_results_filenames(model_files) + latest_results = get_latest_filename( + [Path(f).name for f in model_results_filenames] + ) + latest_sample_results = get_latest_filename( + [Path(f).name for f in model_sample_filenames if task in f] + ) + model_args = re.sub( + r"[\"<>:/\|\\?\*\[\]]+", + "__", + json.load( + open(Path(args.data_path, model, latest_results), encoding="utf-8") + )["config"]["model_args"], + ) + print(model_args) + data = [] + with open( + Path(args.data_path, model, latest_sample_results), + "r", + encoding="utf-8", + ) as file: + for line in file: + data.append(json.loads(line.strip())) + + configs = json.load( + open(Path(args.data_path, model, latest_results), encoding="utf-8") + )["configs"] + config = configs[task] + + if model_index == 0: # Only need to assemble data for the first model + metrics = [] + for metric in config["metric_list"]: + metrics.append( + ZenoMetric( + name=metric["metric"], + type="mean", + columns=[metric["metric"]], + ) + ) + project = client.create_project( + name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""), + view="text-classification", + metrics=metrics, + ) + project.upload_dataset( + generate_dataset(data, config), + id_column="id", + data_column="data", + label_column="labels", + ) + + project.upload_system( + generate_system_df(data, config), + name=model, + id_column="id", + output_column="output", + ) + + +def tasks_for_model(model: str, data_path: str): + """Get the tasks for a specific model. + + Args: + model (str): The name of the model. + data_path (str): The path to the data. + + Returns: + list: A list of tasks for the model. 
+ """ + # get latest model results for a given name + model_dir = Path(data_path, model) + model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()] + model_results_filenames = get_results_filenames(model_files) + latest_results = get_latest_filename(model_results_filenames) + config = (json.load(open(latest_results, encoding="utf-8"))["configs"],) + return list(config[0].keys()) + + +def generate_dataset( + data, + config, +): + """Generate a Zeno dataset from evaluation data. + + Args: + data: The data to generate a dataset for. + config: The configuration of the task. + + Returns: + pd.Dataframe: A dataframe that is ready to be uploaded to Zeno. + """ + ids = [x["doc_id"] for x in data] + labels = [x["target"] for x in data] + instance = [""] * len(ids) + + if config["output_type"] == "loglikelihood": + instance = [x["arguments"][0][0] for x in data] + labels = [x["arguments"][0][1] for x in data] + elif config["output_type"] == "multiple_choice": + instance = [ + x["arguments"][0][0] + + "\n\n" + + "\n".join([f"- {y[1]}" for y in x["arguments"]]) + for x in data + ] + elif config["output_type"] == "loglikelihood_rolling": + instance = [x["arguments"][0][0] for x in data] + elif config["output_type"] == "generate_until": + instance = [x["arguments"][0][0] for x in data] + + return pd.DataFrame( + { + "id": ids, + "data": instance, + "input_len": [len(x) for x in instance], + "labels": labels, + "output_type": config["output_type"], + } + ) + + +def generate_system_df(data, config): + """Generate a dataframe for a specific system to be uploaded to Zeno. + + Args: + data: The data to generate a dataframe from. + config: The configuration of the task. + + Returns: + pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system. + """ + ids = [x["doc_id"] for x in data] + system_dict = {"id": ids} + system_dict["output"] = [""] * len(ids) + + if config["output_type"] == "loglikelihood": + system_dict["output"] = [ + "correct" if x["filtered_resps"][0][1] is True else "incorrect" + for x in data + ] + elif config["output_type"] == "multiple_choice": + system_dict["output"] = [ + ", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data + ] + system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data] + elif config["output_type"] == "loglikelihood_rolling": + system_dict["output"] = [str(x["filtered_resps"][0]) for x in data] + elif config["output_type"] == "generate_until": + system_dict["output"] = [str(x["filtered_resps"][0]) for x in data] + system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data] + + metrics = {} + for metric in config["metric_list"]: + if "aggregation" in metric and metric["aggregation"] == "mean": + metrics[metric["metric"]] = [x[metric["metric"]] for x in data] + + system_dict.update(metrics) + system_df = pd.DataFrame(system_dict) + return system_df + + +if __name__ == "__main__": + main()