BryanW committed on Mar 23

Commit

d2253eb

verified ·

1 Parent(s): 33030f3

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

Koala-36M-v1/.gitattributes +68 -0
Prism/LICENSE +201 -0
Prism/LLaDA/LLaDA_Baseline/.gitignore +210 -0
Prism/LLaDA/LLaDA_Baseline/LICENSE +21 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/__init__.py +7 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/__main__.py +527 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/evaluator.py +765 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/evaluator_utils.py +554 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/__init__.py +25 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/custom.py +17 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/decontamination.py +25 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/extraction.py +233 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/selection.py +61 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/transformation.py +122 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/utils.py +552 -0
Prism/LLaDA/LLaDA_Baseline/evaluation_script.py +21 -0
Prism/LLaDA/LLaDA_Baseline/metrics/gsm8k_all.py +286 -0
Prism/LLaDA/LLaDA_Baseline/metrics/humaneval_all.py +183 -0
Prism/LLaDA/LLaDA_Baseline/metrics/math500_all.py +213 -0
Prism/LLaDA/LLaDA_Baseline/metrics/mbpp_all.py +194 -0
Prism/LLaDA/LLaDA_Baseline/requirements.txt +9 -0
Prism/LLaDA/LLaDA_Baseline/scripts/run_gsm8k.sh +32 -0
Prism/LLaDA/LLaDA_Baseline/scripts/run_humaneval.sh +29 -0
Prism/LLaDA/LLaDA_Baseline/scripts/run_math500.sh +29 -0
Prism/LLaDA/LLaDA_Baseline/scripts/run_mbpp.sh +29 -0
Prism/LLaDA/LLaDA_Prism/.gitignore +210 -0
Prism/LLaDA/LLaDA_Prism/LICENSE +21 -0
Prism/LLaDA/LLaDA_Prism/evaluation_script.py +21 -0
Prism/LLaDA/LLaDA_Prism/requirements.txt +9 -0
Prism/LLaDA/LLaDA_Truthfulqa/.gitignore +3 -0
Prism/LLaDA/LLaDA_Truthfulqa/LICENSE +201 -0
Prism/LLaDA/LLaDA_Truthfulqa/eval_llada.py +413 -0
Prism/LLaDA/LLaDA_Truthfulqa/eval_llada_prism.py +333 -0
Prism/README.md +107 -0
URSA-1.7B/.gitattributes +37 -0
URSA-1.7B/.gitignore +55 -0
URSA-1.7B/LICENSE +176 -0
URSA-1.7B/README.md +117 -0
URSA-1.7B/model_index.json +19 -0
URSA/.flake8 +21 -0
URSA/.gitignore +55 -0
URSA/=4.57.1 +70 -0
URSA/LICENSE +176 -0
URSA/README.md +191 -0
URSA/inference.py +71 -0
URSA/pyproject.toml +3 -0
URSA/requirements.txt +10 -0
URSA/setup.py +133 -0
URSA/ursa.jpg +0 -0
URSA/version.txt +1 -0

Koala-36M-v1/.gitattributes ADDED Viewed

	@@ -0,0 +1,68 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+# Video files - compressed
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
+Koala_36M_1.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_2.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_3.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_4.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_5.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_6.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_7.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_8.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_9.csv filter=lfs diff=lfs merge=lfs -text
+Koala_36M_10.csv filter=lfs diff=lfs merge=lfs -text

Prism/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Prism/LLaDA/LLaDA_Baseline/.gitignore ADDED Viewed

	@@ -0,0 +1,210 @@

+*.jsonl
+*.json
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/

Prism/LLaDA/LLaDA_Baseline/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 preordinary
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Prism/LLaDA/LLaDA_Baseline/dllm_eval/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import logging
+import os
+from .evaluator import evaluate, simple_evaluate
+# Version string exposed by the dllm_eval package.
+__version__ = "0.4.9"

Prism/LLaDA/LLaDA_Baseline/dllm_eval/__main__.py ADDED Viewed

	@@ -0,0 +1,527 @@

+import argparse
+import json
+import logging
+import os
+import sys
+from functools import partial
+from pathlib import Path
+from typing import Union
+from dllm_eval import evaluator, utils
+from dllm_eval.evaluator import request_caching_arg_to_dict
+from dllm_eval.loggers import EvaluationTracker, WandbLogger
+from dllm_eval.tasks import TaskManager
+from dllm_eval.utils import (
+    handle_non_serializable,
+    make_table,
+    simple_parse_args_string,
+)
+def try_parse_json(value: str) -> Union[str, dict, None]:
+    """Parse a CLI argument as JSON, falling back to the raw string.
+
+    Args:
+        value: Raw argument text, or ``None``.
+
+    Returns:
+        ``None`` when ``value`` is ``None``; the decoded JSON value when
+        ``value`` is valid JSON; otherwise ``value`` unchanged.
+
+    Raises:
+        argparse.ArgumentTypeError: If ``value`` is not valid JSON but
+            contains a ``{``. The underlying ``json.JSONDecodeError`` is
+            attached as the cause.
+    """
+    if value is None:
+        return None
+    try:
+        return json.loads(value)
+    except json.JSONDecodeError as err:
+        # A brace suggests the caller meant JSON, so report the failure
+        # instead of silently treating the text as a plain string.
+        if "{" in value:
+            raise argparse.ArgumentTypeError(
+                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
+            ) from err
+        return value
+def _int_or_none_list_arg_type(
+    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
+):
+    """Parse a delimited string into a list of ``max_len`` ints or ``None``.
+
+    Args:
+        min_len: Smallest number of items accepted (a single item is
+            always accepted and repeated).
+        max_len: Largest number of items accepted; also the length of the
+            returned list.
+        defaults: Delimited string supplying values for positions the
+            caller left out.
+        value: Delimited string to parse.
+        split_char: Delimiter used in ``value`` and ``defaults``.
+
+    Returns:
+        A list of ``max_len`` entries, each an ``int`` or ``None``.
+
+    Raises:
+        argparse.ArgumentTypeError: If an item is neither an integer nor
+            "none" (case-insensitive), or if the item count is not 1 and
+            falls outside ``[min_len, max_len]``.
+    """
+
+    def parse_value(item):
+        """Convert one token to ``int``, or to ``None`` for "none"."""
+        item = item.strip().lower()
+        if item == "none":
+            return None
+        try:
+            return int(item)
+        except ValueError as err:
+            # Keep the original ValueError attached as the explicit cause.
+            raise argparse.ArgumentTypeError(
+                f"{item} is not an integer or None"
+            ) from err
+
+    items = [parse_value(v) for v in value.split(split_char)]
+    num_items = len(items)
+
+    if num_items == 1:
+        # Makes downstream handling the same for single and multiple values
+        return items * max_len
+
+    if num_items < min_len or num_items > max_len:
+        raise argparse.ArgumentTypeError(
+            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
+        )
+
+    if num_items != max_len:
+        logging.warning(
+            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
+            "Missing values will be filled with defaults."
+        )
+        default_items = [parse_value(v) for v in defaults.split(split_char)]
+        # Fill the trailing positions the caller omitted with the defaults.
+        items.extend(default_items[num_items:])
+    return items
+def check_argument_types(parser: argparse.ArgumentParser):
+    """Ensure every registered CLI argument declares a ``type``.
+
+    The ``help`` action and any action with a truthy ``const`` are not
+    checked.
+
+    Raises:
+        ValueError: If a checked argument has ``type`` set to ``None``.
+    """
+    for registered in parser._actions:
+        # Skip actions that are exempt from the type requirement.
+        if registered.dest == "help" or registered.const:
+            continue
+        if registered.type is None:
+            raise ValueError(
+                f"Argument '{registered.dest}' doesn't have a type specified."
+            )
+def setup_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
+    )
+    parser.add_argument(
+        "--tasks",
+        "-t",
+        default=None,
+        type=str,
+        metavar="task1,task2",
+        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
+    )
+    parser.add_argument(
+        "--model_args",
+        "-a",
+        default="",
+        type=try_parse_json,
+        help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""",
+    )
+    parser.add_argument(
+        "--num_fewshot",
+        "-f",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Number of examples in few-shot context",
+    )
+    parser.add_argument(
+        "--batch_size",
+        "-b",
+        type=str,
+        default=1,
+        metavar="auto|auto:N|N",
+        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
+    )
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        metavar="N",
+        help="Maximal batch size to try with --batch_size auto.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=None,
+        help="Device to use (e.g. cuda, cuda:0, cpu).",
+    )
+    parser.add_argument(
+        "--output_path",
+        "-o",
+        default=None,
+        type=str,
+        metavar="DIR|DIR/file.json",
+        help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
+    )
+    parser.add_argument(
+        "--limit",
+        "-L",
+        type=float,
+        default=None,
+        metavar="N|0<N<1",
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
+    parser.add_argument(
+        "--samples",
+        "-E",
+        default=None,
+        type=str,
+        metavar="/path/to/json",
+        help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
+    )
+    parser.add_argument(
+        "--use_cache",
+        "-c",
+        type=str,
+        default=None,
+        metavar="DIR",
+        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
+    )
+    parser.add_argument(
+        "--cache_requests",
+        type=str,
+        default=None,
+        choices=["true", "refresh", "delete"],
+        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
+    )
+    parser.add_argument(
+        "--check_integrity",
+        action="store_true",
+        help="Whether to run the relevant part of the test suite for the tasks.",
+    )
+    parser.add_argument(
+        "--write_out",
+        "-w",
+        action="store_true",
+        default=False,
+        help="Prints the prompt for the first few documents.",
+    )
+    parser.add_argument(
+        "--log_samples",
+        "-s",
+        action="store_true",
+        default=False,
+        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
+    )
+    parser.add_argument(
+        "--system_instruction",
+        type=str,
+        default=None,
+        help="System instruction to be used in the prompt",
+    )
+    parser.add_argument(
+        "--apply_chat_template",
+        type=str,
+        nargs="?",
+        const=True,
+        default=False,
+        help=(
+            "If True, apply chat template to the prompt. "
+            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
+            "To apply a specific template from the available list of templates, provide the template name as an argument. "
+            "E.g. `--apply_chat_template template_name`"
+        ),
+    )
+    parser.add_argument(
+        "--fewshot_as_multiturn",
+        action="store_true",
+        default=False,
+        help="If True, uses the fewshot as a multi-turn conversation",
+    )
+    parser.add_argument(
+        "--show_config",
+        action="store_true",
+        default=False,
+        help="If True, shows the the full config of all tasks at the end of the evaluation.",
+    )
+    parser.add_argument(
+        "--include_path",
+        type=str,
+        default=None,
+        metavar="DIR",
+        help="Additional path to include if there are external tasks to include.",
+    )
+    parser.add_argument(
+        "--gen_kwargs",
+        type=try_parse_json,
+        default=None,
+        help=(
+            "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
+            """ e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
+        ),
+    )
+    parser.add_argument(
+        "--verbosity",
+        "-v",
+        type=str.upper,
+        default=None,
+        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
+        help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
+    )
+    parser.add_argument(
+        "--wandb_args",
+        type=str,
+        default="",
+        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
+    )
+    parser.add_argument(
+        "--wandb_config_args",
+        type=str,
+        default="",
+        help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3",
+    )
+    parser.add_argument(
+        "--hf_hub_log_args",
+        type=str,
+        default="",
+        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
+    )
+    parser.add_argument(
+        "--predict_only",
+        "-x",
+        action="store_true",
+        default=False,
+        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
+    )
+    default_seed_string = "0,1234,1234,1234"
+    parser.add_argument(
+        "--seed",
+        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
+        default=default_seed_string,  # for backward compatibility
+        help=(
+            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
+            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
+            "respectively, or a single integer to set the same seed for all four.\n"
+            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
+            "(for backward compatibility).\n"
+            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
+            "Here numpy's seed is not set since the second value is `None`.\n"
+            "E.g, `--seed 42` sets all four seeds to 42."
+        ),
+    )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
+    )
+    parser.add_argument(
+        "--confirm_run_unsafe_code",
+        action="store_true",
+        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
+    )
+    parser.add_argument(
+        "--metadata",
+        type=json.loads,
+        default=None,
+        help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
+    )
+    return parser
+def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
+    check_argument_types(parser)
+    return parser.parse_args()
+def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
+    if not args:
+        # we allow for args to be passed externally, else we parse them ourselves
+        parser = setup_parser()
+        args = parse_eval_args(parser)
+    if args.wandb_args:
+        wandb_args_dict = simple_parse_args_string(args.wandb_args)
+        wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)
+        wandb_logger = WandbLogger(wandb_args_dict, wandb_config_args_dict)
+    utils.setup_logging(args.verbosity)
+    eval_logger = logging.getLogger(__name__)
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    # update the evaluation tracker args with the output path and the HF token
+    if args.output_path:
+        args.hf_hub_log_args += f",output_path={args.output_path}"
+    if os.environ.get("HF_TOKEN", None):
+        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
+    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
+    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
+    if args.predict_only:
+        args.log_samples = True
+    if (args.log_samples or args.predict_only) and not args.output_path:
+        raise ValueError(
+            "Specify --output_path if providing --log_samples or --predict_only"
+        )
+    if args.fewshot_as_multiturn and args.apply_chat_template is False:
+        raise ValueError(
+            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
+        )
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
+    metadata = (
+        simple_parse_args_string(args.model_args)
+        if isinstance(args.model_args, str)
+        else args.model_args
+        if isinstance(args.model_args, dict)
+        else {}
+    ) | (
+        args.metadata
+        if isinstance(args.metadata, dict)
+        else simple_parse_args_string(args.metadata)
+    )
+    task_manager = TaskManager(include_path=args.include_path, metadata=metadata)
+    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
+        eval_logger.warning(
+            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
+        )
+    if args.limit:
+        eval_logger.warning(
+            " --limit SHOULD ONLY BE USED FOR TESTING."
+            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )
+    if args.samples:
+        assert args.limit is None, (
+            "If --samples is not None, then --limit must be None."
+        )
+        if (samples := Path(args.samples)).is_file():
+            args.samples = json.loads(samples.read_text())
+        else:
+            args.samples = json.loads(args.samples)
+    if args.tasks is None:
+        eval_logger.error("Need to specify task to evaluate.")
+        sys.exit()
+    elif args.tasks == "list":
+        print(task_manager.list_all_tasks())
+        sys.exit()
+    elif args.tasks == "list_groups":
+        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
+        sys.exit()
+    elif args.tasks == "list_tags":
+        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
+        sys.exit()
+    elif args.tasks == "list_subtasks":
+        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
+        sys.exit()
+    else:
+        if os.path.isdir(args.tasks):
+            import glob
+            task_names = []
+            yaml_path = os.path.join(args.tasks, "*.yaml")
+            for yaml_file in glob.glob(yaml_path):
+                config = utils.load_yaml_config(yaml_file)
+                task_names.append(config)
+        else:
+            task_list = args.tasks.split(",")
+            task_names = task_manager.match_tasks(task_list)
+            for task in [task for task in task_list if task not in task_names]:
+                if os.path.isfile(task):
+                    config = utils.load_yaml_config(task)
+                    task_names.append(config)
+            task_missing = [
+                task for task in task_list if task not in task_names and "*" not in task
+            ]  # we don't want errors if a wildcard ("*") task name was used
+            if task_missing:
+                missing = ", ".join(task_missing)
+                eval_logger.error(
+                    f"Tasks were not found: {missing}\n"
+                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
+                )
+                raise ValueError(
+                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
+                )
+    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
+    if args.trust_remote_code:
+        eval_logger.info(
+            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
+        )
+        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
+        # because it's already been determined based on the prior env var before launching our
+        # script--`datasets` gets imported by dllm_eval internally before these lines can update the env.
+        import datasets
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+        args.model_args = args.model_args + ",trust_remote_code=True"
+    (
+        eval_logger.info(f"Selected Tasks: {task_names}")
+        if eval_logger.getEffectiveLevel() >= logging.INFO
+        else print(f"Selected Tasks: {task_names}")
+    )
+    request_caching_args = request_caching_arg_to_dict(
+        cache_requests=args.cache_requests
+    )
+    results = evaluator.simple_evaluate(
+        model=args.model,
+        model_args=args.model_args,
+        tasks=task_names,
+        num_fewshot=args.num_fewshot,
+        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
+        device=args.device,
+        use_cache=args.use_cache,
+        limit=args.limit,
+        samples=args.samples,
+        check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        log_samples=args.log_samples,
+        evaluation_tracker=evaluation_tracker,
+        system_instruction=args.system_instruction,
+        apply_chat_template=args.apply_chat_template,
+        fewshot_as_multiturn=args.fewshot_as_multiturn,
+        gen_kwargs=args.gen_kwargs,
+        task_manager=task_manager,
+        predict_only=args.predict_only,
+        random_seed=args.seed[0],
+        numpy_random_seed=args.seed[1],
+        torch_random_seed=args.seed[2],
+        fewshot_random_seed=args.seed[3],
+        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
+        metadata=metadata,
+        **request_caching_args,
+    )
+    if results is not None:
+        if args.log_samples:
+            samples = results.pop("samples")
+        dumped = json.dumps(
+            results, indent=2, default=handle_non_serializable, ensure_ascii=False
+        )
+        if args.show_config:
+            print(dumped)
+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
+        # Add W&B logging
+        if args.wandb_args:
+            try:
+                wandb_logger.post_init(results)
+                wandb_logger.log_eval_result()
+                if args.log_samples:
+                    wandb_logger.log_eval_samples(samples)
+            except Exception as e:
+                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+        evaluation_tracker.save_results_aggregated(
+            results=results, samples=samples if args.log_samples else None
+        )
+        if args.log_samples:
+            for task_name, config in results["configs"].items():
+                evaluation_tracker.save_results_samples(
+                    task_name=task_name, samples=samples[task_name]
+                )
+        if (
+            evaluation_tracker.push_results_to_hub
+            or evaluation_tracker.push_samples_to_hub
+        ):
+            evaluation_tracker.recreate_metadata_card()
+        print(
+            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
+        )
+        if args.wandb_args:
+            # Tear down wandb run once all the logging is done.
+            wandb_logger.run.finish()
+# Run the command-line entry point when this module is executed as a script.
+if __name__ == "__main__":
+    cli_evaluate()

Prism/LLaDA/LLaDA_Baseline/dllm_eval/evaluator.py ADDED Viewed

	@@ -0,0 +1,765 @@

+import itertools
+import json
+import logging
+import random
+import time
+from collections import defaultdict
+from typing import TYPE_CHECKING, List, Optional, Union
+import numpy as np
+import torch
+import dllm_eval.api.metrics
+import dllm_eval.api.registry
+import dllm_eval.api.task
+import dllm_eval.models
+from dllm_eval.caching.cache import delete_cache
+from dllm_eval.evaluator_utils import (
+    consolidate_group_results,
+    consolidate_results,
+    get_sample_size,
+    get_subtask_list,
+    get_task_list,
+    prepare_print_tasks,
+    print_writeout,
+    run_task_tests,
+)
+from dllm_eval.loggers import EvaluationTracker
+from dllm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
+from dllm_eval.tasks import TaskManager, get_task_dict
+from dllm_eval.utils import (
+    handle_non_serializable,
+    hash_string,
+    positional_deprecated,
+    setup_logging,
+    simple_parse_args_string,
+)
+if TYPE_CHECKING:
+    from dllm_eval.api.model import LM
+    from dllm_eval.api.task import Task
+eval_logger = logging.getLogger(__name__)
+@positional_deprecated
+def simple_evaluate(
+    model,
+    model_args: Optional[Union[str, dict]] = None,
+    tasks: Optional[List[Union[str, dict, object]]] = None,
+    num_fewshot: Optional[int] = None,
+    batch_size: Optional[Union[int, str]] = None,
+    max_batch_size: Optional[int] = None,
+    device: Optional[str] = None,
+    use_cache: Optional[str] = None,
+    cache_requests: bool = False,
+    rewrite_requests_cache: bool = False,
+    delete_requests_cache: bool = False,
+    limit: Optional[Union[int, float]] = None,
+    samples: Optional[dict] = None,
+    bootstrap_iters: int = 100000,
+    check_integrity: bool = False,
+    write_out: bool = False,
+    log_samples: bool = True,
+    evaluation_tracker: Optional[EvaluationTracker] = None,
+    system_instruction: Optional[str] = None,
+    apply_chat_template: Union[bool, str] = False,
+    fewshot_as_multiturn: bool = False,
+    gen_kwargs: Union[str, dict, None] = None,
+    task_manager: Optional[TaskManager] = None,
+    verbosity=None,
+    predict_only: bool = False,
+    random_seed: int = 0,
+    numpy_random_seed: int = 1234,
+    torch_random_seed: int = 1234,
+    fewshot_random_seed: int = 1234,
+    confirm_run_unsafe_code: bool = False,
+    metadata: Optional[dict] = None,
+):
+    """Instantiate and evaluate a model on a list of tasks.
+    :param model: Union[str, LM]
+        Name of model or LM object, see dllm_eval.models.get_model
+    :param model_args: Optional[str, dict]
+        String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
+        Ignored if `model` argument is a LM object.
+    :param tasks: list[Union[str, dict, Task]]
+        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
+    :param num_fewshot: int
+        Number of examples in few-shot context
+    :param batch_size: int or str, optional
+        Batch size for model
+    :param max_batch_size: int, optional
+        Maximal batch size to try with automatic batch size detection
+    :param device: str, optional
+        PyTorch device (e.g. "cpu" or "cuda:0") for running models
+    :param use_cache: str, optional
+        A path to a sqlite db file for caching model responses. `None` if not caching.
+    :param cache_requests: bool, optional
+        Speed up evaluation by caching the building of dataset requests. `None` if not caching.
+    :param rewrite_requests_cache: bool, optional
+        Rewrites all the request cache if set to `True`. `None` if not desired.
+    :param delete_requests_cache: bool, optional
+        Deletes all the request cache if set to `True`. `None` if not desired.
+    :param limit: int or float, optional
+        Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
+    :param samples: dictionary, optional
+        Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
+    :param bootstrap_iters:
+        Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed.
+    :param check_integrity: bool
+        Whether to run the relevant part of the test suite for the tasks
+    :param write_out: bool
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
+    :param evaluation_tracker: EvaluationTracker, optional
+        If given, the experiment arguments (model, model_args, system instruction, chat template, fewshot_as_multiturn) are logged through its `general_config_tracker`.
+    :param system_instruction: str
+        System instruction to be applied to the prompt
+    :param apply_chat_template: Union[bool, str]
+        Specifies whether to apply a chat template to the prompt.
+        - If set to True, the default chat template is applied.
+        - If set to a string, applies the specified chat template by name.
+        Defaults to False (no chat template applied).
+    :param fewshot_as_multiturn: bool
+        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
+    :param gen_kwargs: dict or comma-separated string
+        Arguments for model generation
+        Ignored for all tasks with loglikelihood output_type
+    :param task_manager: TaskManager, optional
+        Task manager used to resolve `tasks`. If None, one is created whose metadata merges `model_args` with `metadata`.
+    :param verbosity: str
+        Verbosity level for logging
+    :param predict_only: bool
+        If true only model outputs will be generated and returned. Metrics will not be evaluated
+    :param random_seed: int
+        Random seed for python's random module. If set to None, the seed will not be set.
+    :param numpy_random_seed: int
+        Random seed for numpy. If set to None, the seed will not be set.
+    :param torch_random_seed: int
+        Random seed for torch. If set to None, the seed will not be set.
+    :param fewshot_random_seed: int
+        Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
+    :param confirm_run_unsafe_code: bool
+        Forwarded unchanged to `evaluate()`.
+    :param metadata: dict
+        Additional metadata to be added to the task manager. Will get passed to the download function of the task.
+    :return
+        Dictionary of results when `lm.rank == 0`; None on every other rank.
+    """
+    if verbosity is not None:
+        setup_logging(verbosity=verbosity)
+    start_date = time.time()
+    if limit is not None and samples is not None:
+        raise ValueError(
+            "Either 'limit' or 'samples' must be None, but both are not None."
+        )
+    # Heuristic: warn when the model arguments contain "inst" (case-insensitive) but no chat template is applied.
+    if (
+        (isinstance(model_args, str) and "inst" in model_args.lower())
+        or (
+            isinstance(model_args, dict)
+            and any("inst" in str(v).lower() for v in model_args.values())
+        )
+    ) and not apply_chat_template:
+        eval_logger.warning(
+            "Model appears to be an instruct variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
+        )
+    if delete_requests_cache:
+        eval_logger.info("Deleting requests cache...")
+        delete_cache()
+    # Seed each RNG whose seed is not None and collect one combined log message.
+    seed_message = []
+    if random_seed is not None:
+        # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
+        seed_message.append(f"Setting random seed to {random_seed}")
+        random.seed(random_seed)
+    if numpy_random_seed is not None:
+        seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
+        np.random.seed(numpy_random_seed)
+    if torch_random_seed is not None:
+        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
+        torch.manual_seed(torch_random_seed)
+    # The fewshot seed is only reported here; it is applied per task in `_adjust_config` below.
+    if fewshot_random_seed is not None:
+        seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
+    if seed_message:
+        eval_logger.info(" | ".join(seed_message))
+    if tasks is None:
+        tasks = []
+    if len(tasks) == 0:
+        raise ValueError(
+            "No tasks specified, or no tasks found. Please verify the task names."
+        )
+    if gen_kwargs is not None:
+        if isinstance(gen_kwargs, str):
+            gen_kwargs = simple_parse_args_string(gen_kwargs)
+        eval_logger.warning(
+            f"generation_kwargs: {gen_kwargs} specified through cli, these settings will update set parameters in yaml tasks. "
+            "Ensure 'do_sample=True' for non-greedy decoding!"
+        )
+        # an empty gen_kwargs is normalized to None so that task defaults are left untouched
+        if not gen_kwargs:
+            gen_kwargs = None
+    # Build the LM from its registry name, or accept an already-initialized LM instance.
+    if isinstance(model, str):
+        if model_args is None:
+            eval_logger.warning("model_args not specified. Using defaults.")
+            model_args = ""
+        if isinstance(model_args, dict):
+            eval_logger.info(
+                f"Initializing {model} model, with arguments: {model_args}"
+            )
+            lm = dllm_eval.api.registry.get_model(model).create_from_arg_obj(
+                model_args,
+                {
+                    "batch_size": batch_size,
+                    "max_batch_size": max_batch_size,
+                    "device": device,
+                },
+            )
+        else:
+            eval_logger.info(
+                f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
+            )
+            lm = dllm_eval.api.registry.get_model(model).create_from_arg_string(
+                model_args,
+                {
+                    "batch_size": batch_size,
+                    "max_batch_size": max_batch_size,
+                    "device": device,
+                },
+            )
+    else:
+        if not isinstance(model, dllm_eval.api.model.LM):
+            raise TypeError(
+                f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of dllm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `dllm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
+            )
+        eval_logger.info("Using pre-initialized model")
+        lm = model
+    if use_cache is not None:
+        eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
+        lm = dllm_eval.api.model.CachingLM(
+            lm,
+            use_cache
+            # each rank receives a different cache db.
+            # necessary to avoid multiple writes to cache at once
+            + "_rank"
+            + str(lm.rank)
+            + ".db",
+        )
+    # No task manager supplied: build one whose metadata merges model_args with the explicit
+    # `metadata` argument (values from `metadata` win on key clashes).
+    if task_manager is None:
+        metadata = (
+            simple_parse_args_string(model_args)
+            if isinstance(model_args, str)
+            else model_args
+            if isinstance(model_args, dict)
+            else {}
+        ) | (metadata or {})
+        task_manager = TaskManager(metadata=metadata)
+    task_dict = get_task_dict(
+        tasks,
+        task_manager,
+    )
+    # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
+    # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
+    def _adjust_config(task_dict):
+        adjusted_task_dict = {}
+        for task_name, task_obj in task_dict.items():
+            if isinstance(task_obj, dict):
+                # nested dict (a group): recurse and keep the same nesting in the result
+                adjusted_task_dict = {
+                    **adjusted_task_dict,
+                    **{task_name: _adjust_config(task_obj)},
+                }
+            else:
+                if task_obj.get_config("output_type") == "generate_until":
+                    if gen_kwargs is not None:
+                        task_obj.set_config(
+                            key="generation_kwargs", value=gen_kwargs, update=True
+                        )
+                    eval_logger.info(
+                        f"{task_obj.config.task}: Using gen_kwargs: {task_obj.config.generation_kwargs}"
+                    )
+                if predict_only:
+                    eval_logger.info(
+                        f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
+                    )
+                    # we have to change the class properties post-hoc. This is pretty hacky.
+                    task_obj.override_metric(metric_name="bypass")
+                # override tasks' fewshot values to the provided num_fewshot arg value
+                # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
+                if num_fewshot is not None:
+                    if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
+                        eval_logger.info(
+                            f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
+                        )
+                    else:
+                        eval_logger.warning(
+                            f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
+                        )
+                        task_obj.set_config(key="num_fewshot", value=num_fewshot)
+                else:
+                    # if num_fewshot not provided, and the task does not define a default one, default to 0
+                    if (
+                        default_num_fewshot := task_obj.get_config("num_fewshot")
+                    ) is None:
+                        task_obj.set_config(key="num_fewshot", value=0)
+                # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
+                task_obj.set_fewshot_seed(seed=fewshot_random_seed)
+                adjusted_task_dict[task_name] = task_obj
+        return adjusted_task_dict
+    task_dict = _adjust_config(task_dict)
+    if check_integrity:
+        run_task_tests(task_list=tasks)
+    if evaluation_tracker is not None:
+        evaluation_tracker.general_config_tracker.log_experiment_args(
+            model_source=model,
+            model_args=model_args,
+            system_instruction=system_instruction,
+            chat_template=lm.chat_template(apply_chat_template)
+            if apply_chat_template
+            else None,
+            fewshot_as_multiturn=fewshot_as_multiturn,
+        )
+    # predict_only forces log_samples=True for the evaluate() call
+    results = evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        limit=limit,
+        samples=samples,
+        cache_requests=cache_requests,
+        rewrite_requests_cache=rewrite_requests_cache,
+        bootstrap_iters=bootstrap_iters,
+        write_out=write_out,
+        log_samples=True if predict_only else log_samples,
+        system_instruction=system_instruction,
+        apply_chat_template=apply_chat_template,
+        fewshot_as_multiturn=fewshot_as_multiturn,
+        verbosity=verbosity,
+        confirm_run_unsafe_code=confirm_run_unsafe_code,
+    )
+    if verbosity is not None:
+        setup_logging(verbosity=verbosity)
+    # Only rank 0 enriches and returns the results; every other rank returns None.
+    if lm.rank == 0:
+        if isinstance(model, str):
+            model_name = model
+        elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
+            model_name = model.config._name_or_path
+        else:
+            model_name = type(model).__name__
+        # add info about the model and few shot config
+        results["config"] = {
+            "model": model_name,
+            "model_args": model_args,
+        }
+        # add more detailed model info if available
+        if isinstance(lm, dllm_eval.models.huggingface.HFLM):
+            results["config"].update(lm.get_model_info())
+        # add info about execution
+        results["config"].update(
+            {
+                "batch_size": batch_size,
+                "batch_sizes": (
+                    list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
+                ),
+                "device": device,
+                "use_cache": use_cache,
+                "limit": limit,
+                "bootstrap_iters": bootstrap_iters,
+                "gen_kwargs": gen_kwargs,
+                "random_seed": random_seed,
+                "numpy_seed": numpy_random_seed,
+                "torch_seed": torch_random_seed,
+                "fewshot_seed": fewshot_random_seed,
+            }
+        )
+        results["git_hash"] = get_git_commit_hash()
+        # "date" holds the time.time() timestamp taken at the start of this call
+        results["date"] = start_date
+        add_env_info(results)  # additional environment info to results
+        add_tokenizer_info(results, lm)  # additional info about tokenizer
+        return results
+    else:
+        return None
+@positional_deprecated
+def evaluate(
+    lm: "LM",
+    task_dict,
+    limit: Optional[int] = None,
+    samples: Optional[dict] = None,
+    cache_requests: bool = False,
+    rewrite_requests_cache: bool = False,
+    bootstrap_iters: Optional[int] = 100000,
+    write_out: bool = False,
+    log_samples: bool = True,
+    system_instruction: Optional[str] = None,
+    apply_chat_template: Union[bool, str] = False,
+    fewshot_as_multiturn: bool = False,
+    verbosity: str = "INFO",
+    confirm_run_unsafe_code: bool = False,
+):
+    """Instantiate and evaluate a model on a list of tasks.
+    :param lm: obj
+        Language Model
+    :param task_dict: dict[str, Task]
+        Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
+    :param limit: int, optional
+        Limit the number of examples per task (only use this for testing)
+    :param samples: dictionary, optional
+        Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
+    :param cache_requests: bool, optional
+        Speed up evaluation by caching the building of dataset requests.
+    :param rewrite_requests_cache: bool, optional
+        Rewrites all the request cache if set to `True`.
+    :param bootstrap_iters:
+        Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
+    :param write_out: bool
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
+    :param system_instruction: str
+        System instruction to be applied to the prompt
+    :param apply_chat_template: Union[bool, str]
+        Specifies whether to apply a chat template to the prompt.
+        - If set to True, the default chat template is applied.
+        - If set to a string, applies the specified chat template by name.
+        Defaults to False (no chat template applied).
+    :param fewshot_as_multiturn: bool
+        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
+    :param verbosity: str
+        Verbosity level for logging
+    :param confirm_run_unsafe_code: bool
+        Whether to confirm running tasks marked as unsafe.
+    :return
+        Dictionary of results
+    """
+    if limit is not None and samples is not None:
+        raise ValueError(
+            "Either 'limit' or 'samples' must be None, but both are not None."
+        )
+    if samples is not None:
+        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
+    if apply_chat_template:
+        eval_logger.warning(
+            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
+        )
+    # tracks all Instances/requests a model must generate output on.
+    requests = defaultdict(list)
+    # stores the amount to pad out reqs per req. type so that
+    # number of fwd passes per distributed rank is equal
+    padding_requests = defaultdict(int)
+    # get lists of group hierarchy and each type of request
+    eval_tasks = get_task_list(task_dict)
+    if not log_samples:
+        if not all(
+            "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
+            for task_output in eval_tasks
+        ):
+            raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
+    # validation checks:
+    # 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
+    # 2.are we running code that is marked as unsafe.
+    incompatible_tasks = []
+    for task_output in eval_tasks:
+        task: Task = task_output.task
+        if getattr(task, "MULTIMODAL", False) and not getattr(lm, "MULTIMODAL", False):
+            incompatible_tasks.append(task_output.task_name)
+        elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
+            raise ValueError(
+                f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
+            )
+    if len(incompatible_tasks) > 0:
+        if not getattr(lm, "MULTIMODAL", False):
+            raise ValueError(
+                f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
+            )
+    # end validation check
+    # Cache the limit arg.
+    limit_arg = limit
+    limits = []
+    for task_output in eval_tasks:
+        task: Task = task_output.task
+        limit = get_sample_size(task, limit_arg)
+        limits.append(limit)
+        task.build_all_requests(
+            limit=limit,
+            samples=samples.get(task_output.task_name, None)
+            if samples is not None
+            else samples,
+            rank=lm.rank,
+            world_size=lm.world_size,
+            cache_requests=cache_requests,
+            rewrite_requests_cache=rewrite_requests_cache,
+            system_instruction=system_instruction,
+            apply_chat_template=bool(apply_chat_template),
+            fewshot_as_multiturn=fewshot_as_multiturn,
+            chat_template=getattr(lm, "apply_chat_template")
+            if apply_chat_template
+            else None,
+            tokenizer_name=getattr(lm, "tokenizer_name", "")
+            if apply_chat_template
+            else "",
+        )
+        eval_logger.debug(
+            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
+        )
+        if write_out:
+            print_writeout(task)
+        # aggregate Instances by LM method requested to get output.
+        for instance in task.instances:
+            reqtype = instance.request_type
+            requests[reqtype].append(instance)
+        if lm.world_size > 1:
+            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
+            gathered_item = (
+                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
+            )
+            # "multiple_choice" task types dispatch (several) "loglikelihood" request types
+            reqtype = (
+                "loglikelihood"
+                if task.OUTPUT_TYPE == "multiple_choice"
+                else task.OUTPUT_TYPE
+            )
+            # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
+            numpad = max(gathered_item) - gathered_item[lm.rank]
+            # todo: may not account for padding in cases like SquadV2 which has multiple req types
+            padding_requests[reqtype] += numpad
+    ### Run LM on inputs, get all outputs ###
+    # execute each type of request
+    for reqtype, reqs in requests.items():
+        eval_logger.info(f"Running {reqtype} requests")
+        # create `K` copies of each request `req` based off `K = req.repeats`
+        cloned_reqs = []
+        for req in reqs:
+            cloned_reqs.extend([req] * req.repeats)
+        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
+            for _ in range(padding_requests[reqtype]):
+                cloned_reqs.extend([req] * req.repeats)
+        # run requests through model
+        resps = getattr(lm, reqtype)(cloned_reqs)
+        # put responses from model into a list of length K for each request.
+        for x, req in zip(resps, cloned_reqs):
+            req.resps.append(x)
+        if lm.world_size > 1:
+            lm.accelerator.wait_for_everyone()
+    RANK = lm.rank
+    WORLD_SIZE = lm.world_size
+    ### Postprocess outputs ###
+    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
+    for task_output, limit in zip(eval_tasks, limits):
+        task = task_output.task
+        task.apply_filters()
+        ### Collect values of metrics on all datapoints ###
+        # # unpack results and sort back in order and return control to Task
+        # TODO: make it possible to use a different metric per filter
+        # Pre-process task.instances to group by doc_id
+        instances_by_doc_id = defaultdict(list)
+        for instance in task.instances:
+            instances_by_doc_id[instance.doc_id].append(instance)
+        # Sort instances within each group
+        for instances in instances_by_doc_id.values():
+            instances.sort(key=lambda x: x.idx)
+        # iterate over different filters used
+        for filter_key in task.instances[0].filtered_resps.keys():
+            indices = (
+                samples.get(task_output.task_name, None)
+                if samples is not None
+                else None
+            )
+            doc_iterator = task.doc_iterator(
+                rank=RANK,
+                limit=limit,
+                world_size=WORLD_SIZE,
+                samples=indices,
+            )
+            for doc_id, doc in doc_iterator:
+                if indices:
+                    doc_id_true = indices[doc_id]
+                else:
+                    doc_id_true = doc_id
+                requests = instances_by_doc_id[doc_id]
+                metrics = task.process_results(
+                    doc, [req.filtered_resps[filter_key] for req in requests]
+                )
+                if log_samples:
+                    target = task.doc_to_target(doc)
+                    example = {
+                        "doc_id": doc_id_true,
+                        "doc": doc,
+                        "target": target,
+                        "arguments": [req.args for req in requests],
+                        "resps": [req.resps for req in requests],
+                        "filtered_resps": [
+                            req.filtered_resps[filter_key] for req in requests
+                        ],
+                        "filter": filter_key,
+                        "metrics": list(metrics.keys()),
+                        "doc_hash": hash_string(
+                            json.dumps(
+                                requests[0].doc,
+                                indent=2,
+                                default=handle_non_serializable,
+                                ensure_ascii=False,
+                            )
+                        ),
+                        "prompt_hash": hash_string(requests[0].arguments[0]),
+                        "target_hash": hash_string(str(target)),
+                    }
+                    example.update(metrics)
+                    task_output.logged_samples.append(example)
+                for metric, value in metrics.items():
+                    task_output.sample_metrics[(metric, filter_key)].append(value)
+    if WORLD_SIZE > 1:
+        # if multigpu, then gather data across all ranks to rank 0
+        # first gather logged samples across all ranks
+        for task_output in eval_tasks:
+            if log_samples:
+                # for task_name, task_samples in list(samples.items()):
+                full_samples = [None] * WORLD_SIZE if RANK == 0 else None
+                torch.distributed.gather_object(
+                    obj=task_output.logged_samples,
+                    object_gather_list=full_samples,
+                    dst=0,
+                )
+                if RANK == 0:
+                    task_output.logged_samples = list(
+                        itertools.chain.from_iterable(full_samples)
+                    )
+            # then collect metrics across all ranks
+            for metrics in task_output.sample_metrics:
+                metric_list = [None] * WORLD_SIZE if RANK == 0 else None
+                torch.distributed.gather_object(
+                    obj=task_output.sample_metrics[metrics],
+                    object_gather_list=metric_list,
+                    dst=0,
+                )
+                if RANK == 0:
+                    task_output.sample_metrics[metrics] = list(
+                        itertools.chain.from_iterable(metric_list)
+                    )
+    if RANK == 0:
+        ### Aggregate results over all datapoints ###
+        # aggregate results ; run bootstrap CIs
+        for task_output in eval_tasks:
+            task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
+        (
+            results,
+            samples,
+            configs,
+            versions,
+            num_fewshot,
+            higher_is_better,
+        ) = consolidate_results(eval_tasks)
+        ### Calculate group metrics ###
+        if bool(results):
+            results, versions, show_group_table, *_ = consolidate_group_results(
+                results, versions, task_dict
+            )
+        results_agg, group_agg = prepare_print_tasks(task_dict, results)
+        subtask_list = get_subtask_list(task_dict)
+        # collect all higher_is_better values for metrics
+        # in the group's subtasks.
+        # TODO: clean this up ; unify with the below metric_list loop?
+        _higher_is_better = {}
+        for group, task_list in subtask_list.items():
+            if (
+                len(task_list) != 0
+            ):  # subtask list will list "task_name": [] for solo tasks
+                for task in task_list:
+                    for m, h in higher_is_better[task].items():
+                        if m not in _higher_is_better.keys():
+                            _higher_is_better[m] = h
+                        if (
+                            m in _higher_is_better
+                            and _higher_is_better[m] is not None
+                            and _higher_is_better[m] != h
+                        ):
+                            eval_logger.warning(
+                                f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
+                            )
+                            _higher_is_better[m] = None
+                higher_is_better[group] = _higher_is_better
+        results_dict = {
+            "results": dict(results_agg.items()),
+            **(
+                {"groups": dict(group_agg.items())}
+                if (bool(group_agg) & show_group_table)
+                else {}
+            ),
+            "group_subtasks": dict(reversed(subtask_list.items())),
+            "configs": dict(sorted(configs.items())),
+            "versions": dict(sorted(versions.items())),
+            "n-shot": dict(sorted(num_fewshot.items())),
+            "higher_is_better": dict(sorted(higher_is_better.items())),
+            "n-samples": {
+                task_output.task_name: {
+                    "original": len(task_output.task.eval_docs),
+                    "effective": min(
+                        limit if limit else len(task_output.task.eval_docs),
+                        len(task_output.task.eval_docs),
+                    ),
+                }
+                for task_output, limit in zip(eval_tasks, limits)
+            },
+        }
+        if log_samples:
+            results_dict["samples"] = dict(samples)
+        return results_dict
+    else:
+        return None
+def request_caching_arg_to_dict(cache_requests: str) -> dict:
+    request_caching_args = {
+        "cache_requests": cache_requests in {"true", "refresh"},
+        "rewrite_requests_cache": cache_requests == "refresh",
+        "delete_requests_cache": cache_requests == "delete",
+    }
+    return request_caching_args

Prism/LLaDA/LLaDA_Baseline/dllm_eval/evaluator_utils.py ADDED Viewed

	@@ -0,0 +1,554 @@

+import collections
+import logging
+import math
+import pathlib
+import sys
+from typing import List, Optional, Tuple, Union
+from dllm_eval.api.group import ConfigurableGroup
+from dllm_eval.api.metrics import (
+    aggregate_subtask_metrics,
+    mean,
+    pooled_sample_stderr,
+    stderr_for_metric,
+)
+from dllm_eval.api.task import Task
+from dllm_eval.utils import positional_deprecated
+eval_logger = logging.getLogger(__name__)
+class TaskOutput:
+    """
+    Wrapper class for Task outputs.It contains various attributes and methods to manage and calculate metrics for the task.
+        Attributes:
+            task (object): The task object.
+            task_name (str): The name of the task.
+            task_config (dict): The configuration of the task.
+            version (str): The version of the task.
+            group_name (str): The name of the task group.
+            n_shot (int): The number of shots for the task.
+            task_alias (str): The alias of the task.
+            group_alias (str): The alias of the task group.
+            is_group (bool): Indicates if the task is a group.
+            logged_samples (list): The list of logged samples.
+            sample_len (int): The length of the samples.
+            sample_metrics (defaultdict): The dictionary of samples' metrics.
+            agg_metrics (defaultdict): The dictionary of aggregate metrics.
+        Methods:
+            from_taskdict(cls, task_name: str, task):
+                Creates a TaskOutput instance from a task dictionary.
+            calculate_aggregate_metric(bootstrap_iters=100000) -> None:
+                Calculates the aggregate metrics for the task.
+    """
+    def __init__(
+        self,
+        task=None,
+        task_name=None,
+        task_config=None,
+        version=None,
+        group_name=None,
+        n_shot=None,
+        task_alias=None,
+        group_alias=None,
+        is_group=None,
+    ):
+        self.task = task
+        self.task_config = task_config
+        self.task_name = task_name
+        self.group_name = group_name
+        self.version = version
+        self.n_shot = n_shot
+        self.task_alias = task_alias
+        self.group_alias = group_alias
+        self.is_group = is_group
+        self.logged_samples = []
+        self.sample_len = None
+        self.sample_metrics = collections.defaultdict(list)
+        self.agg_metrics = collections.defaultdict(list)
+    @classmethod
+    def from_taskdict(cls, task_name: str, task):
+        if isinstance(task, tuple):
+            group_name, task = task
+        else:
+            group_name = None
+        if not task:
+            # these gets filtered out in get_task_list
+            # once they are added to group hierarchy
+            is_group = True
+            return cls(
+                task=task, task_name=task_name, is_group=is_group, group_name=group_name
+            )
+        version = task.VERSION
+        task_config = dict(task.dump_config())
+        if (n_shot := task_config.get("num_fewshot")) == 0:
+            n_shot = task_config.get("metadata", {}).get("num_fewshot", 0)
+        task_alias = task_config.get("alias")
+        group_alias = task_config.get("group_alias")
+        return cls(
+            task=task,
+            task_name=task_name,
+            task_config=task_config,
+            group_name=group_name,
+            version=version,
+            n_shot=n_shot,
+            task_alias=task_alias,
+            group_alias=group_alias,
+        )
+    def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
+        for (metric, filter_key), items in self.sample_metrics.items():
+            try:
+                agg_fn = self.task.aggregation()[metric]
+            except KeyError:
+                # This is when process results output an arbitrary metric
+                # TODO: Handle this better and allow other aggregate functions other than mean.
+                agg_fn = mean
+            metric_key = f"{metric},{filter_key}"
+            self.agg_metrics[metric_key] = agg_fn(items)
+            self.sample_len = len(items)  # TODO: same sample size for each metric?
+            if isinstance(bootstrap_iters, int):
+                stderr_fn = stderr_for_metric(
+                    metric=agg_fn,
+                    bootstrap_iters=min(bootstrap_iters, 100)
+                    if metric in ["bleu", "chrf", "ter"]
+                    else bootstrap_iters,
+                )
+                self.agg_metrics[f"{metric}_stderr,{filter_key}"] = (
+                    stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
+                )
+            else:
+                raise ValueError(
+                    f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations."
+                )
+    def __repr__(self):
+        return (
+            f"TaskOutput(task_name={self.task_name}, "
+            f"group_name={self.group_name}, "
+            f"version={self.version}, "
+            f"n_shot={self.n_shot}, "
+            f"task_alias={self.task_alias}, "
+            f"group_alias={self.group_alias})"
+        )
+def get_task_list(task_dict: dict) -> List[TaskOutput]:
+    outputs = []
+    for task_name, task_obj in task_dict.items():
+        if isinstance(task_obj, dict):
+            _outputs = get_task_list(task_obj)
+            outputs.extend(_outputs)
+        else:
+            task_output = TaskOutput.from_taskdict(task_name, task_obj)
+            outputs.append(task_output)
+    return outputs
+def get_subtask_list(task_dict, task_root=None, depth=0):
+    subtask_list = {}
+    for group_obj, task_obj in task_dict.items():
+        if isinstance(group_obj, ConfigurableGroup):
+            # group_name = group_obj.group_name
+            group_name = group_obj.group_name
+        else:
+            group_name = group_obj
+        if isinstance(task_obj, dict):
+            _subtask_list = get_subtask_list(
+                task_obj, task_root=group_name, depth=depth + 1
+            )
+            if task_root:
+                subtask_list.setdefault((task_root, depth), []).extend(
+                    [
+                        _task
+                        for (_task, _depth) in _subtask_list.keys()
+                        if (_depth - 1) == depth
+                    ]
+                )
+            subtask_list = {**subtask_list, **_subtask_list}
+        else:
+            if isinstance(task_obj, ConfigurableGroup):
+                # group_or_task_name = task_obj.group_name
+                group_or_task_name = task_obj.group_name
+            elif isinstance(task_obj, Task):
+                # group_or_task_name = task_obj.task_name
+                group_or_task_name = task_obj.task_name
+            if task_root is None:
+                subtask_list.setdefault((group_or_task_name, depth), [])
+            else:
+                subtask_list.setdefault((task_root, depth), []).append(
+                    group_or_task_name
+                )
+    if depth == 0:
+        _subtask_list = {}
+        for group_key, task_list in subtask_list.items():
+            group_name, depth = group_key
+            _subtask_list[group_name] = task_list
+        subtask_list = _subtask_list
+    return subtask_list
+def print_writeout(task) -> None:
+    for inst in task.instances:
+        # print the prompt for the first few documents
+        if inst.doc_id < 1:
+            eval_logger.info(
+                f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\
+    \n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
+            )
+            eval_logger.info(f"Request: {str(inst)}")
+def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
+    if limit is not None:
+        limit = (
+            int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
+        )
+    return limit
+def prepare_print_tasks(
+    task_dict: dict,
+    results: dict,
+    task_depth=0,
+    group_depth=0,
+) -> Tuple[dict, dict]:
+    """
+    @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
+    value is a list of task names.
+    @param results: Dictionary containing the results of each task. Each key is a
+    group name and its value is a dictionary of task results.
+    @param task_depth: The indentation level for printing the task
+    hierarchy. Default is 0.
+    @param group_depth: The indentation level for printing the group
+    hierarchy. Default is 0.
+    @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
+    aggregated results for each task, and groups_agg contains aggregated results for each group.
+    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
+    """
+    def _sort_task_dict(task_dict):
+        """
+        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
+        Required so that we end up sorting within each sub-header correctly.
+        """
+        return dict(
+            sorted(
+                task_dict.items(),
+                key=lambda item: item[0].group_name
+                if isinstance(item[0], ConfigurableGroup)
+                else item[0],
+            )
+        )
+    task_agg = collections.defaultdict(dict)
+    group_agg = collections.defaultdict(dict)
+    task_dict = _sort_task_dict(task_dict)
+    for task_or_group_name, task_or_group_obj in task_dict.items():
+        tab_string = " " * task_depth + "- " if task_depth > 0 else ""
+        if isinstance(task_or_group_name, ConfigurableGroup):
+            # string_name = task_or_group_name.group_name
+            name = task_or_group_name.group_name
+            from_configurable_group = True
+            task_or_group_obj = _sort_task_dict(task_or_group_obj)
+        elif isinstance(task_or_group_name, str):
+            name = task_or_group_name
+            if isinstance(task_or_group_obj, Task):
+                # string_name = task_or_group_obj.task_name
+                name = task_or_group_obj.task_name
+            from_configurable_group = False
+        task_agg[name] = results[name].copy()
+        if from_configurable_group:
+            if task_or_group_name.group_alias is not None:
+                alias = task_or_group_name.group_alias
+            else:
+                alias = task_or_group_name.group
+        else:
+            if "alias" in task_agg[name]:
+                alias = task_agg[name]["alias"]
+            else:
+                alias = name
+        task_agg[name]["alias"] = tab_string + alias
+        if "samples" in task_agg[name]:
+            task_agg[name].pop("samples")
+        if from_configurable_group and (" " not in results[name]):
+            group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
+            group_agg[name] = results[name].copy()
+            group_agg[name]["alias"] = group_tab_string + alias
+            if "samples" in group_agg[name]:
+                group_agg[name].pop("samples")
+        if isinstance(task_or_group_obj, dict):
+            task_depth += 1
+            group_depth += 1
+            _task_agg, _group_agg = prepare_print_tasks(
+                task_or_group_obj, results, task_depth, group_depth
+            )
+            task_agg = {
+                **task_agg,
+                **_task_agg,
+            }
+            group_agg = {**group_agg, **_group_agg}
+            task_depth -= 1
+            group_depth -= 1
+    return task_agg, group_agg
+def consolidate_results(
+    eval_tasks: List[TaskOutput],
+) -> Tuple[dict, dict, dict, dict, dict, dict]:
+    """
+    @param eval_tasks: list(TaskOutput).
+    @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
+    Consolidates the results of multiple evaluation tasks into a single structure.
+    The method iterates over each evaluation instance and extracts relevant information to create the consolidated
+    results structure. The consolidated results structure has the following properties:
+    - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
+    metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task
+    aliases specified in the task configuration.
+    - samples: A defaultdict with task names as keys and lists of log samples as values.
+    - configs: A defaultdict with task names as keys and task configurations as values.
+    - versions: A defaultdict with task names as keys and task versions as values.
+    - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
+    - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
+    for each metric as values.
+    The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
+    """
+    # stores the final result for each task, for each metric/filter pair.
+    results = collections.defaultdict(dict)
+    # logs info about each document evaluated.
+    samples = collections.defaultdict(list)
+    # store num-fewshot value per task
+    num_fewshot = collections.defaultdict(int)
+    # Tracks the YAML configs of all chosen task
+    configs = collections.defaultdict(dict)
+    # Tracks each task's version.
+    versions = collections.defaultdict(dict)
+    # Track `higher_is_better` for each metric
+    higher_is_better = collections.defaultdict(dict)
+    for task_output in eval_tasks:
+        if "task_alias" in (task_config := task_output.task_config):
+            results[task_output.task_name]["alias"] = task_config["task_alias"]
+        else:
+            results[task_output.task_name]["alias"] = task_output.task_name
+        if group_alias := task_output.group_alias:
+            if group_alias not in results and (group_name := task_output.group_name):
+                results[group_name]["alias"] = group_alias
+        num_fewshot[task_output.task_name] = task_output.n_shot
+        configs[task_output.task_name] = task_output.task_config
+        versions[task_output.task_name] = task_output.version
+        samples[task_output.task_name] = task_output.logged_samples
+        higher_is_better[task_output.task_name] = task_output.task.higher_is_better()
+        for (metric, filter_key), items in task_output.sample_metrics.items():
+            metric_key = f"{metric},{filter_key}"
+            results[task_output.task_name][metric_key] = task_output.agg_metrics[
+                metric_key
+            ]
+            results[task_output.task_name]["samples"] = task_output.sample_len
+            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
+                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
+            )
+    return results, samples, configs, versions, num_fewshot, higher_is_better
+def consolidate_group_results(
+    results,
+    versions,
+    task_dict,
+    task_root=None,
+    show_group_table=False,
+    task_aggregation_list=None,
+) -> Tuple[dict, dict, bool, Union[None,]]:
+    """
+    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
+    @param results: Per-task results, indexed by task / group name; updated in place with group entries.
+    @param versions: Per-task versions, indexed by task / group name; updated in place for groups with metadata.
+    @param task_dict: The (sub-)hierarchy to walk. Keys are ConfigurableGroup objects or names, values are Task
+    objects or nested dictionaries.
+    @param task_root: Name of the group currently being descended into (falsy at the top level).
+    @param show_group_table: Running flag carried through the recursion.
+    @param task_aggregation_list: Dictionary shared across the recursion that collects, per group, the names of
+    the tasks beneath it.
+    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
+    - results: A defaultdict with task names (and, after this function is called, group names of
+    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
+    - versions: A defaultdict with task names (and, after this function is called, group names of
+    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
+    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
+    - task_aggregation_list: a dictionary listing the subtasks to average over to produce a given group's end metric.
+    In the top-level invocation of this function, task_aggregation_list is ignored.
+    NOTE(review): the fourth element of the return annotation is ``Union[None,]`` although the value
+    returned in that position is the task_aggregation_list dictionary.
+    """
+    if task_root is None:
+        task_root = {}
+    if task_aggregation_list is None:
+        task_aggregation_list = {}
+    for group_or_task, group_or_task_info in task_dict.items():
+        # Convert to string
+        if isinstance(group_or_task, ConfigurableGroup):
+            group_config = group_or_task.config
+            group_or_task = group_or_task.group_name
+        else:
+            group_config = None
+        if isinstance(group_or_task_info, Task):
+            # leaf task: register it under the group currently being descended into
+            if task_root:
+                task_aggregation_list.setdefault(task_root, []).append(
+                    group_or_task_info.task_name
+                )
+        else:
+            # nested level: recurse with this entry as the new root
+            (
+                results,
+                versions,
+                show_group_table,
+                _task_aggregation_list,
+            ) = consolidate_group_results(
+                results,
+                versions,
+                group_or_task_info,
+                group_or_task,
+                show_group_table,
+                task_aggregation_list,
+            )
+            if task_root:
+                # propagate the nested group's tasks up to the enclosing group
+                task_aggregation_list.setdefault(task_root, []).extend(
+                    task_aggregation_list.get(group_or_task, [])
+                )
+            if (group_config is None) or (
+                group_config["aggregate_metric_list"] is None
+            ):
+                # no aggregation requested: store a " " marker key (prepare_print_tasks
+                # tests for it before adding a group row) and move on
+                results[group_or_task][" "] = " "
+                continue
+            # NOTE(review): "aggregate_metric_list" was already indexed above, so this membership
+            # test looks redundant; if it were ever False, agg_metric_list would be unbound below.
+            if "aggregate_metric_list" in group_config:
+                agg_metric_list = group_config["aggregate_metric_list"]
+            show_group_table = show_group_table | bool(
+                group_config["aggregate_metric_list"]
+            )
+            task_list = _task_aggregation_list[group_or_task]
+            # every "metric,filter" key reported by any subtask, excluding stderr and bookkeeping keys
+            metric_list = list(
+                {
+                    key
+                    for task in task_list
+                    for key in results[task].keys()
+                    if "_stderr" not in key and key not in ["task", "alias", "samples"]
+                }
+            )
+            for metric in metric_list:
+                # "metric,filter" -> "metric_stderr,filter"
+                stderr = "_stderr,".join(metric.split(","))
+                # gather metrics, sizes, and stderrs from subtasks
+                metrics = [
+                    results[task][metric]
+                    for task in task_list
+                    if metric in results[task]
+                ]  # TODO: copy?
+                stderrs = [
+                    results[task][stderr]
+                    for task in task_list
+                    if stderr in results[task]
+                ]
+                sizes = [
+                    results[task]["samples"]
+                    for task in task_list
+                    if metric in results[task]
+                ]
+                for metric_config in agg_metric_list:
+                    for filter_name in metric_config["filter_list"]:
+                        # only aggregate when this config entry targets the current metric/filter key
+                        if metric != ",".join([metric_config["metric"], filter_name]):
+                            continue
+                        # compute group's pooled metric and stderr
+                        if metric_config["aggregation"] == "mean":
+                            aggregate_fn = aggregate_subtask_metrics
+                        elif callable(metric_config["aggregation"]):
+                            aggregate_fn = metric_config["aggregation"]
+                        else:
+                            raise ValueError(
+                                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
+                            )
+                        results[group_or_task][metric] = aggregate_fn(
+                            metrics,
+                            sizes,
+                            metric_config["weight_by_size"],
+                        )
+                        # TODO: calculate groups' metrics using arbitrary agg fns
+                        if "N/A" in stderrs:
+                            results[group_or_task][stderr] = "N/A"
+                        else:
+                            # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
+                            results[group_or_task][stderr] = pooled_sample_stderr(
+                                stderrs, sizes
+                            )
+                # NOTE(review): "samples" and the group version are written once per metric, so they
+                # reflect the last metric processed and are not set when metric_list is empty.
+                results[group_or_task]["samples"] = sum(sizes)
+                group_metadata = group_config.get("metadata", None)
+                if group_metadata is not None:
+                    versions[group_or_task] = group_metadata.get("version", None)
+    # print(results)
+    return results, versions, show_group_table, task_aggregation_list
+@positional_deprecated
+def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
+    """
+    Search upward in the directory tree to a maximum of three layers
+    to find and return the package root (containing the 'tests' folder)
+    """
+    cur_path = start_path.resolve()
+    max_layers = 3
+    for _ in range(max_layers):
+        if (cur_path / "tests" / "test_version_stable.py").exists():
+            return cur_path
+        else:
+            cur_path = cur_path.parent.resolve()
+    raise FileNotFoundError(
+        f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
+    )
+@positional_deprecated
+def run_task_tests(task_list: List[str]):
+    """
+    Find the package root and run the tests for the given tasks
+    """
+    import pytest
+    package_root = find_test_root(start_path=pathlib.Path(__file__))
+    task_string = " or ".join(task_list)
+    args = [
+        f"{package_root}/tests/test_version_stable.py",
+        f"--rootdir={package_root}",
+        "-k",
+        f"{task_string}",
+    ]
+    sys.path.append(str(package_root))
+    pytest_return_val = pytest.main(args)
+    if pytest_return_val:
+        raise ValueError(
+            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
+        )

Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from functools import partial
+from typing import List
+from dllm_eval.api.filter import FilterEnsemble
+from dllm_eval.api.registry import get_filter
+from . import custom, extraction, selection, transformation
+def build_filter_ensemble(
+    filter_name: str, components: List[List[str]]
+) -> FilterEnsemble:
+    """
+    Create a filtering pipeline.
+    """
+    filters = []
+    for function, kwargs in components:
+        if kwargs is None:
+            kwargs = {}
+        # create a filter given its name in the registry
+        f = partial(get_filter(function), **kwargs)
+        # add the filter as a pipeline step
+        filters.append(f)
+    return FilterEnsemble(name=filter_name, filters=filters)

Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/custom.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from dllm_eval.api.filter import Filter
+from dllm_eval.api.registry import register_filter
+@register_filter("custom")
+class CustomFilter(Filter):
+    """
+    Custom filter that applies a custom, user-defined function to the model responses.
+    """
+    def __init__(self, **kwargs) -> None:
+        self.filter_fn = kwargs.pop("filter_fn")
+        super().__init__(**kwargs)
+    def apply(self, resps, docs):
+        return self.filter_fn(resps, docs)

Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/decontamination.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from dllm_eval.api.filter import Filter
+from dllm_eval.api.registry import register_filter
+@register_filter("decontaminate")
+class DecontaminationFilter(Filter):
+    """
+    Placeholder for a decontamination filter; no filtering logic is implemented yet.
+    """
+    name = "track_decontamination"
+    def __init__(self, path) -> None:
+        """
+        TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
+        should further cache result on a given (task_name, doc_id)
+        NOTE: `path` is currently unused and the base-class initializer is not called.
+        """
+        self._decontam_results = None
+    def apply(self, resps, docs) -> None:
+        """
+        Intended to return {"no_contamination", "only_contamination"} keys for the 2 different subsets.
+        Not implemented: the method currently does nothing and returns None.
+        """
+        pass

Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/extraction.py ADDED Viewed

	@@ -0,0 +1,233 @@

+import re
+import sys
+import unicodedata
+from dllm_eval.api.filter import Filter
+from dllm_eval.api.registry import register_filter
+@register_filter("regex")
+class RegexFilter(Filter):
+    """A filter that extracts values from text using regex pattern matching.
+    This filter applies a regex pattern to each model response and extracts matched values.
+    If no match is found, returns a fallback value. Useful for extracting structured data
+    (like numbers) from unstructured model outputs.
+    """
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select: int = 0,
+        fallback: str = "[invalid]",
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
+        def filter_set(inst):
+            filtered = []
+            for resp in inst:
+                match = self.regex.findall(resp)
+                if match:
+                    match = match[self.group_select]
+                    if isinstance(match, tuple):
+                        match = [m for m in match if m]
+                        if match:
+                            match = match[0]
+                        else:
+                            match = self.fallback
+                    match = match.strip()
+                else:
+                    match = self.fallback
+                filtered.append(match)
+            return filtered
+        filtered_resps = list(map(lambda x: filter_set(x), resps))
+        return filtered_resps
+@register_filter("regex_pos")
+class POSFilter(Filter):
+    """ """
+    def __init__(
+        self,
+        regex_pattern: str = r"\['(.*?)'\]",
+        group_select=0,
+        fallback=None,
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        if fallback is None:
+            fallback = ["invalid"]
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+    def apply(self, resps, docs):
+        def extract_tagged_tokens(text):
+            # Extract tagged tokens list from text input using regex
+            tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+            return [(token, pos) for token, pos in tokens]
+        def extract_pos_tags(result):
+            pos_tags = []
+            if isinstance(result, str):
+                result = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result)
+            return pos_tags if pos_tags else self.fallback
+        def filter_set(inst):
+            filtered = []
+            for resp in inst:
+                match = extract_pos_tags(resp)
+                filtered.append(match)
+            return filtered
+        filtered_resps = map(lambda x: filter_set(x), resps)
+        return filtered_resps
+@register_filter("remove_whitespace")
+class WhitespaceFilter(Filter):
+    """Filters out leading whitespace from responses."""
+    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+        def filter_set(inst):
+            filtered_resp = []
+            for resp in inst:
+                resp = resp.lstrip()
+                filtered_resp.append(resp)
+            return filtered_resp
+        filtered_resps = [filter_set(resp) for resp in resps]
+        return filtered_resps
+@register_filter("multi_choice_regex")
+class MultiChoiceRegexFilter(RegexFilter):
+    """
+    A filter used to extract a model's answer on multiple choice questions with
+    letter answers. assumes each document has a "choices" field
+    containing the list of answer choices and that the answer label symbols
+    are of the form (A), (B), (C), ... or A, B, C.
+    """
+    def __init__(
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
+    ) -> None:
+        """
+        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
+                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
+                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
+        group_select: Selects the (group_select)th match from the findall result.
+        ignore_case: Ignores the case during step 1 matching
+        ignore_punctuation: Remove the punctuation during step 1 matching
+        regexes_to_ignore: Remove these regexes during step 1 matching
+        """
+        super().__init__(regex_pattern, group_select, fallback)
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+        self.regexes_to_ignore = regexes_to_ignore
+    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+        # here, we assume we have a list, in which each element is
+        # a list of model responses for some particular input/target pair.
+        # so we process each of these (same input/target response sets)
+        # independently (and keep them a list.)
+        def find_match(regex, resp, convert_dict={}):
+            match = regex.findall(resp)
+            if match:
+                match = match[self.group_select]
+                if isinstance(match, tuple):
+                    match = [m for m in match if m][0]
+                match = match.strip()
+                if match and match in convert_dict:
+                    match = convert_dict[match]
+            return match
+        punct_tbl = dict.fromkeys(
+            i
+            for i in range(sys.maxunicode)
+            if unicodedata.category(chr(i)).startswith("P")
+        )
+        def filter_ignores(st):
+            if self.regexes_to_ignore is not None:
+                for s in self.regexes_to_ignore:
+                    st = re.sub(s, "", st)
+            if self.ignore_case:
+                st = st.lower()
+            if self.ignore_punctuation:
+                # https://stackoverflow.com/a/266162
+                st = st.translate(punct_tbl)
+            return st
+        filtered_resps = []
+        for r, doc in zip(resps, docs):
+            fallback_regexes = []
+            choice_to_alpha = {}
+            next_alpha = "A"
+            without_paren_fallback_regexes = []
+            without_paren_to_target = {}
+            choices = doc["choices"]
+            for c in choices:
+                m = filter_ignores(c.strip())
+                fallback_regexes.append(f"{re.escape(m)}")
+                choice_to_alpha[m] = f"({next_alpha})"
+                without_paren_fallback_regexes.append(next_alpha)
+                without_paren_to_target[next_alpha] = f"({next_alpha})"
+                next_alpha = chr(ord(next_alpha) + 1)
+            fallback_regex = re.compile("|".join(fallback_regexes))
+            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
+            without_paren_fallback_regex = re.compile(
+                rf":[\s]*({without_paren_fallback_regex})"
+            )
+            filtered = []
+            for resp in r:
+                match = find_match(self.regex, resp)
+                if not match:
+                    match = find_match(
+                        fallback_regex, filter_ignores(resp), choice_to_alpha
+                    )
+                    if not match:
+                        match = find_match(
+                            without_paren_fallback_regex, resp, without_paren_to_target
+                        )
+                if not match:
+                    match = self.fallback
+                filtered.append(match)
+            filtered_resps.append(filtered)
+        return filtered_resps

Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/selection.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from collections import Counter
+from dllm_eval.api.filter import Filter
+from dllm_eval.api.registry import register_filter
+# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
+# that takes an input and returns a scalar and then should select the max reward,
+# or should implement different filters for different ways of handling a reward model's inference.
+@register_filter("take_first")
+class TakeFirstFilter(Filter):
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+    def apply(self, resps, docs):
+        """
+        Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
+        """
+        return map(lambda r: r[0], resps)
+@register_filter("take_first_k")
+class TakeKFilter(Filter):
+    def __init__(self, **kwargs) -> None:
+        self.k = kwargs.pop("k")
+        super().__init__(**kwargs)
+    def apply(self, resps, docs):
+        # need resp to be subscriptable to check below
+        resps = list(resps)
+        # check we have at least k responses per doc, else we can't take the first k
+        assert len(resps[0]) >= self.k, (
+            f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
+        )
+        return map(lambda r: r[: self.k], resps)
+@register_filter("majority_vote")
+class MajorityVoteFilter(Filter):
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+    def apply(self, resps, docs):
+        """
+        Each entry of `resps` is a list of model responses.
+        We select the response that occurs most frequently in each entry of `resps`.
+        """
+        def select_majority(resp):
+            counts = Counter(resp)
+            vote = counts.most_common(1)[0][0]
+            return vote
+        return map(lambda r: [select_majority(r)], resps)

Prism/LLaDA/LLaDA_Baseline/dllm_eval/filters/transformation.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import re
+from dllm_eval.api.filter import Filter
+from dllm_eval.api.registry import register_filter
+@register_filter("lowercase")
+class LowercaseFilter(Filter):
+    def __init__(self) -> None:
+        pass
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [resp.lower() for resp in inst]
+        return [filter_set(resp) for resp in resps]
+@register_filter("uppercase")
+class UppercaseFilter(Filter):
+    def __init__(self) -> None:
+        pass
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [resp.upper() for resp in inst]
+        return [filter_set(resp) for resp in resps]
+@register_filter("map")
+class MapFilter(Filter):
+    def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
+        """
+        Initializes the MapFilter with a given mapping dictionary and default value.
+        Args:
+        - mapping_dict (dict): A dictionary containing the key-value mappings.
+                               Default is an empty dictionary.
+        - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
+                               Default is None.
+        Example:
+        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
+        """
+        if mapping_dict is None:
+            mapping_dict = {}
+        assert isinstance(mapping_dict, dict), (
+            "Provided mapping_dict is not a dictionary"
+        )
+        self.mapping_dict = mapping_dict
+        self.default_value = default_value
+    def apply(self, resps, docs):
+        def filter_set(inst):
+            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
+        return [filter_set(resp) for resp in resps]
+@register_filter("format_span")
+class SPANFilter(Filter):
+    def __init__(self) -> None:
+        pass
+    def apply(self, resps, docs):
+        def format_ner_text(text):
+            label_dict = {
+                "person": "PER",
+                "location": "LOC",
+                "organization": "ORG",
+                "counties": "LOC",
+                "places": "LOC",
+                "people": "PER",
+                "persons": "PER",
+                "company": "ORG",
+                "country": "LOC",
+                "continent": "LOC",
+                "time": "DATE",
+                "date": "DATE",
+                "per": "PER",
+                "loc": "LOC",
+                "org": "ORG",
+            }
+            text = text.lower()
+            for key, value in label_dict.items():
+                text = text.replace(key, value)
+            text = "$".join(i for i in text.split("$$"))
+            return text.rstrip("$$")
+        def format_named_entities(text):
+            """
+            Extract named entities from text and format them as 'label: value $$ label: value'.
+            Handles grouped entities (e.g., LOC: kenya, uganda) and excludes 'none' values.
+            """
+            # Regular expression to match label: entities pattern
+            pattern = r"\b(PER|LOC|ORG|DATE):\s*([^$]+)"
+            # Normalize newline characters
+            text = text.replace("\n", "$").strip()
+            matches = re.findall(pattern, text)
+            formatted_entities = []
+            for label, values in matches:
+                # Split multiple entities separated by commas and strip whitespace
+                entities = [value.strip() for value in values.split(",")]
+                # Exclude 'none' entities
+                for entity in entities:
+                    if entity.lower() != "none":
+                        formatted_entities.append(f"{label.lower()}: {entity}")
+            # Join entities with the desired separator
+            return " $ ".join(formatted_entities)
+        def filter_set(inst):
+            return [
+                format_named_entities(format_ner_text(resp.lower())) for resp in inst
+            ]
+        return [filter_set(resp) for resp in resps]

Prism/LLaDA/LLaDA_Baseline/dllm_eval/utils.py ADDED Viewed

	@@ -0,0 +1,552 @@

+import collections
+import fnmatch
+import functools
+import hashlib
+import importlib.util
+import inspect
+import json
+import logging
+import os
+import re
+from dataclasses import asdict, is_dataclass
+from itertools import islice
+from pathlib import Path
+from typing import Any, Callable, Generator, List, Optional, Tuple
+import numpy as np
+import yaml
+from jinja2 import BaseLoader, Environment, StrictUndefined
+# A run of 47 space characters.
+SPACING = " " * 47
+# Arrow printed by make_table beside a metric: "↑" when higher is better, "↓" when lower is better.
+HIGHER_IS_BETTER_SYMBOLS = {
+    True: "↑",
+    False: "↓",
+}
+def setup_logging(verbosity=logging.INFO):
+    # Configure the root logger
+    class CustomFormatter(logging.Formatter):
+        def format(self, record):
+            if record.name.startswith("dllm_eval."):
+                record.name = record.name[len("dllm_eval.") :]
+            return super().format(record)
+    formatter = CustomFormatter(
+        "%(asctime)s %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
+        datefmt="%Y-%m-%d:%H:%M:%S",
+    )
+    log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    log_level = level_map.get(str(log_level).upper(), logging.INFO)
+    if not logging.root.handlers:
+        handler = logging.StreamHandler()
+        handler.setFormatter(formatter)
+        root_logger = logging.getLogger()
+        root_logger.addHandler(handler)
+        root_logger.setLevel(log_level)
+        if log_level == logging.DEBUG:
+            third_party_loggers = ["urllib3", "filelock", "fsspec"]
+            for logger_name in third_party_loggers:
+                logging.getLogger(logger_name).setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(log_level)
+def hash_string(string: str) -> str:
+    return hashlib.sha256(string.encode("utf-8")).hexdigest()
+def escaped_split(text, sep_char, maxsplit=-1):
+    """Split text into a list on occurrences of the given separation
+    character `sep_char`. The separation character may be escaped by a
+    backslash to avoid splitting at that location.
+    The separation character must be a string of size 1.
+    If `maxsplit` is given, at most `maxsplit` splits are done (thus,
+    the list will have at most `maxsplit + 1` elements). If `maxsplit`
+    is not specified or less than 0, then there is no limit on the
+    number of splits (all possible splits are made).
+    """
+    assert len(sep_char) == 1, (
+        "separation string must be a single character for escaped splitting"
+    )
+    if maxsplit == 0:
+        return text
+    maxsplit = max(0, maxsplit)
+    return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
+def handle_arg_string(arg):
+    if arg.lower() == "true":
+        return True
+    elif arg.lower() == "false":
+        return False
+    elif arg.isnumeric():
+        return int(arg)
+    try:
+        return float(arg)
+    except ValueError:
+        return arg
+def handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+def sanitize_list(sub):
+    """
+    Takes possible nested list and recursively converts all inner component to strings
+    """
+    if isinstance(sub, list):
+        return [sanitize_list(item) for item in sub]
+    if isinstance(sub, tuple):
+        return tuple(sanitize_list(item) for item in sub)
+    else:
+        return str(sub)
+def simple_parse_args_string(args_string: Optional[str]) -> dict:
+    """
+    Parses something like
+        args1=val1,arg2=val2
+    Into a dictionary
+    """
+    if args_string is None:
+        return {}
+    args_string = args_string.strip()
+    if not args_string:
+        return {}
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {
+        kv[0]: handle_arg_string("=".join(kv[1:]))
+        for kv in [arg.split("=") for arg in arg_list]
+    }
+    return args_dict
+def join_iters(iters):
+    for iter in iters:
+        yield from iter
+def group(arr, fn):
+    res = collections.defaultdict(list)
+    for ob in arr:
+        res[fn(ob)].append(ob)
+    return list(res.values())
+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+def softmax(x) -> np.ndarray:
+    """Compute softmax values for each sets of scores in x."""
+    e_x = np.exp(x - np.max(x))
+    return e_x / e_x.sum()
+def general_detokenize(string) -> str:
+    string = string.replace(" n't", "n't")
+    string = string.replace(" )", ")")
+    string = string.replace("( ", "(")
+    string = string.replace('" ', '"')
+    string = string.replace(' "', '"')
+    string = re.sub(r" (['.,])", r"\1", string)
+    return string
+def get_file_task_name(filename: str) -> str:
+    """
+    Given the sample results filenames, extracts and returns the task name.
+    """
+    return filename[filename.find("_") + 1 : filename.rfind("_")]
+def get_file_datetime(filename: str) -> str:
+    """
+    Given the results and sample results filenames, extracts and returns the datetime.
+    """
+    return filename[filename.rfind("_") + 1 :].replace(".jsonl", "")
+def sanitize_model_name(model_name: str) -> str:
+    """
+    Given the model name, returns a sanitized version of it.
+    """
+    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+def sanitize_task_name(task_name: str) -> str:
+    """
+    Given the task name, returns a sanitized version of it.
+    """
+    return re.sub(r"\W", "_", task_name)
+def get_latest_filename(filenames: List[str]) -> str:
+    """
+    Given a list of filenames, returns the filename with the latest datetime.
+    """
+    return max(filenames, key=lambda f: get_file_datetime(f))
+def get_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to aggregated results.
+    """
+    return [f for f in filenames if "/results_" in f and ".json" in f]
+def get_sample_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to sample results.
+    """
+    return [f for f in filenames if "/samples_" in f and ".json" in f]
+def get_rolling_token_windows(
+    token_list: List[int], prefix_token: int, max_seq_len: int, context_len: int
+) -> Generator[Tuple[List[int], List[int]], None, None]:
+    """
+    - context_len allows for a rolling window context, allowing each prediction window to potentially
+      condition on some context
+    :param token_list: list
+        List of tokens to be PREDICTED
+    :param max_seq_len: int
+        max_seq_len of model (or max_seq_len we want to use)
+    :param context_len: int
+        Amount of desired token context for prediction. Needs to be at least 1.
+    :param prefix_token: token
+        Dummy token like <eos> so the first token has something to condition on
+    :return: generator
+        Generator of tuples
+            (input_tokens, pred_tokens)
+        Note: Score only the last len(pred_tokens) logits of the LM
+    """
+    assert 1 <= context_len <= max_seq_len
+    if not token_list:
+        return
+    # +1 offset, going from input->preds
+    pred_len = max_seq_len - context_len + 1
+    predicted = 0
+    # Special handling for first window: predict all tokens
+    first_seq_len = min(max_seq_len, len(token_list))
+    yield [prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]
+    predicted += first_seq_len
+    while predicted < len(token_list):
+        window_pred_len = min(len(token_list) - predicted, pred_len)
+        window_end = predicted + window_pred_len
+        yield (
+            token_list[window_end - max_seq_len - 1 : window_end - 1],
+            token_list[window_end - window_pred_len : window_end],
+        )
+        predicted += window_pred_len
+def make_disjoint_window(
+    pair: Tuple[List[int], List[int]],
+) -> Tuple[List[int], List[int]]:
+    """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
+    a, b = pair
+    return a[: len(a) - (len(b) - 1)], b
+class EnhancedJSONEncoder(json.JSONEncoder):
+    """
+    Provides a proper json encoding for the loggers and trackers json dumps.
+    Notably manages the json encoding of dataclasses.
+    """
+    def default(self, o):
+        if is_dataclass(o):
+            return asdict(o)
+        return super().default(o)
+class Reorderer:
+    def __init__(self, arr: List[Any], fn: Callable) -> None:
+        """Reorder an array according to some function
+        Args:
+            arr (List[Any]): The initial array
+            fn (Callable[[Any], Any]): A function to determine the priority of elements
+        """
+        self.size = len(arr)
+        arr = list(enumerate(arr))
+        arr = group(arr, lambda x: fn(x[1]))
+        # arr = [([y[0] for y in x], x[0][1]) for x in arr]
+        # TODO: overhaul reorderer. It currently grouped requests by content but we don't want this
+        arr = [([y[0]], x[0][1]) for x in arr for y in x]
+        arr.sort(key=lambda x: fn(x[1]))
+        self.arr = arr
+    def get_reordered(self):
+        """Gets the reordered array
+        Returns:
+            List[Any]: The reordered array
+        """
+        return [x[1] for x in self.arr]
+    def get_original(self, newarr):
+        """Restores the original order of a new array based on the old array's order
+        Args:
+            newarr (List[Any]): The array to be restored
+        Returns:
+            List[Any]: The array restored to the original order
+        """
+        res = [None] * self.size
+        cov = [False] * self.size
+        for (inds, _), v in zip(self.arr, newarr):
+            for ind in inds:
+                res[ind] = v
+                cov[ind] = True
+        assert all(cov)
+        return res
+def make_table(result_dict, column: str = "results", sort_results: bool = False):
+    """Generate a Markdown table of results.
+    Args:
+        result_dict: Results mapping. Reads `result_dict[column]` and
+            `result_dict["versions"]`, and optionally `"n-shot"` and
+            `"higher_is_better"`.
+        column: Either "results" (per-task rows) or "groups" (per-group rows).
+        sort_results: Sort rows alphabetically by task/group name when True.
+    Returns:
+        str: The table rendered by `MarkdownTableWriter.dumps()`.
+    Raises:
+        ValueError: If `column` is neither "results" nor "groups".
+    """
+    from pytablewriter import LatexTableWriter, MarkdownTableWriter
+    if column == "results":
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"
+    else:
+        raise ValueError(
+            f"Unsupported column {column!r}; expected 'results' or 'groups'."
+        )
+    all_headers = [
+        column_name,
+        "Version",
+        "Filter",
+        "n-shot",
+        "Metric",
+        "",
+        "Value",
+        "",
+        "Stderr",
+    ]
+    md_writer = MarkdownTableWriter()
+    latex_writer = LatexTableWriter()
+    md_writer.headers = all_headers
+    latex_writer.headers = all_headers
+    values = []
+    keys = result_dict[column].keys()
+    if sort_results:
+        # sort entries alphabetically by task or group name.
+        # NOTE: we default here to false, because order matters for multi-level table printing a la mmlu.
+        # sorting here would mess that up
+        keys = sorted(keys)
+    for k in keys:
+        # Work on a copy so removing "alias" below does not modify the caller's results.
+        dic = dict(result_dict[column][k])
+        version = result_dict["versions"].get(k, "    N/A")
+        # The fallback must be a mapping: a string default has no `.get`.
+        n = str(result_dict.get("n-shot", {}).get(k, " "))
+        higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})
+        if "alias" in dic:
+            k = dic.pop("alias")
+        for mf, v in sorted(dic.items()):
+            # Keys have the form "<metric>,<filter>".
+            m, _, f = mf.partition(",")
+            if m.endswith("_stderr"):
+                continue
+            hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
+            v = "%.4f" % v if isinstance(v, float) else v
+            stderr_key = m + "_stderr" + "," + f
+            if stderr_key in dic:
+                se = dic[stderr_key]
+                se = "   N/A" if se == "N/A" else "%.4f" % se
+                values.append([k, version, f, n, m, hib, v, "±", se])
+            else:
+                values.append([k, version, f, n, m, hib, v, "", ""])
+            # Only the first row of a task/group shows its name and version.
+            k = ""
+            version = ""
+    md_writer.value_matrix = values
+    latex_writer.value_matrix = values
+    # todo: make latex table look good
+    # print(latex_writer.dumps())
+    return md_writer.dumps()
+def positional_deprecated(fn):
+    """
+    A decorator to nudge users into passing only keyword args (`kwargs`) to the
+    wrapped function, `fn`.
+    A warning is printed whenever the number of positional arguments differs
+    from the allowance: one when `inspect.ismethod(fn)` is true, otherwise zero.
+    The call is always forwarded unchanged.
+    """
+    @functools.wraps(fn)
+    def _wrapper(*args, **kwargs):
+        # Parenthesised on purpose: without the parentheses the conditional
+        # expression binds looser than `!=`, and the check is always falsy
+        # for plain functions.
+        allowed_positional = 1 if inspect.ismethod(fn) else 0
+        if len(args) != allowed_positional:
+            print(
+                f"WARNING: using {fn.__name__} with positional arguments is "
+                "deprecated and will be disallowed in a future version of "
+                "lm-evaluation-harness!"
+            )
+        return fn(*args, **kwargs)
+    return _wrapper
+def ignore_constructor(loader, node):
+    """Return the YAML `node` unchanged; `loader` is not used.
+    `load_yaml_config` registers this for the `!function` tag in "simple" mode,
+    so tagged values are left as raw nodes instead of being imported.
+    """
+    return node
+def import_function(loader: yaml.Loader, node, yaml_path: Path):
+    """Resolve a `!function` scalar of the form `module.attr` to the named object.
+    The module is loaded from `<yaml_path's directory>/<module>.py` and executed;
+    the attribute after the last dot is returned.
+    Raises:
+        ImportError: If no import spec or no loader can be obtained for the file.
+    """
+    dotted_name = loader.construct_scalar(node)
+    # Everything before the last dot names the module file; the remainder is the attribute.
+    module_name, _, attr_name = dotted_name.rpartition(".")
+    module_path = yaml_path.parent / f"{module_name}.py"
+    spec = importlib.util.spec_from_file_location(module_name, module_path.as_posix())
+    if spec is None:
+        raise ImportError(f"Could not import module {module_name} from {module_path}.")
+    module = importlib.util.module_from_spec(spec)
+    if spec.loader is None:
+        raise ImportError(f"Module loader is None, {module_name} from {module_path}.")
+    spec.loader.exec_module(module)
+    return getattr(module, attr_name)
+def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
+    """Load a YAML config, handling `!function` tags and `include` directives.
+    Args:
+        yaml_path: Path of the YAML file. Required in "full" mode and whenever
+            `yaml_config` is not supplied.
+        yaml_config: An already-parsed mapping; when given, no file is read.
+        yaml_dir: Directory used to resolve relative `include` entries.
+            Defaults to the directory of `yaml_path`.
+        mode: "simple" leaves `!function` values as raw YAML nodes; "full"
+            imports the referenced functions.
+    Returns:
+        dict: The config. When `include` is present, included files are merged
+        first (earlier entries override later ones) and the config's own keys
+        override all of them.
+    Raises:
+        ValueError: On an unknown `mode`, or when a required path is missing.
+    """
+    if mode == "simple":
+        constructor_fn = ignore_constructor
+    elif mode == "full":
+        if yaml_path is None:
+            raise ValueError("yaml_path must be provided if mode is 'full'.")
+        # Attach yaml_path to the import function so that it can be used later
+        constructor_fn = functools.partial(import_function, yaml_path=Path(yaml_path))
+    else:
+        raise ValueError(f"Unknown mode {mode!r}; expected 'simple' or 'full'.")
+    loader = yaml.CLoader if yaml.__with_libyaml__ else yaml.FullLoader
+    # NOTE(review): this registers the constructor on the loader class itself, and
+    # "full" mode executes the Python modules named by `!function` tags, so only
+    # trusted configs should be loaded.
+    yaml.add_constructor("!function", constructor_fn, Loader=loader)
+    if yaml_config is None:
+        if yaml_path is None:
+            raise ValueError("Either yaml_path or yaml_config must be provided.")
+        with open(yaml_path, "rb") as file:
+            yaml_config = yaml.load(file, Loader=loader)
+    if "include" not in yaml_config:
+        return yaml_config
+    if yaml_dir is None:
+        if yaml_path is None:
+            raise ValueError(
+                "yaml_dir or yaml_path must be provided to resolve 'include' entries."
+            )
+        yaml_dir = os.path.dirname(yaml_path)
+    include_path = yaml_config.pop("include")
+    if isinstance(include_path, str):
+        include_path = [include_path]
+    final_yaml_config = {}
+    # Load from the last one first, so earlier entries take precedence.
+    for path in reversed(include_path):
+        # Assumes that path is a full path.
+        # If not found, assume the included yaml
+        # is in the same dir as the original yaml
+        if not os.path.isfile(path):
+            path = os.path.join(yaml_dir, path)
+        # Failures while loading an included file propagate to the caller.
+        final_yaml_config.update(load_yaml_config(yaml_path=path, mode=mode))
+    final_yaml_config.update(yaml_config)
+    return final_yaml_config
+def regex_replace(string, pattern, repl, count: int = 0):
+    """Jinja filter exposing `re.sub`: replace matches of `pattern` in `string` with `repl`.
+    A `count` of 0 replaces every match.
+    """
+    compiled = re.compile(pattern)
+    return compiled.sub(repl, string, count=count)
+# Shared template environment used by `apply_template`.
+# NOTE(review): `StrictUndefined` presumably makes rendering fail on variables
+# that are missing from the document; confirm against the Jinja documentation.
+env = Environment(
+    loader=BaseLoader, undefined=StrictUndefined, keep_trailing_newline=True
+)
+# Make `regex_replace` available to templates as a filter.
+env.filters["regex_replace"] = regex_replace
+def apply_template(template: str, doc: dict) -> str:
+    """Render `template` with the module-level `env`, passing the entries of `doc` as variables."""
+    return env.from_string(template).render(**doc)
+def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
+    """
+    Build a strided, optionally truncated view of a raw document iterator.
+    Items are taken starting at position `rank`, every `world_size`-th item,
+    stopping before position `limit` of the raw iterator (no stop when None).
+    Used to split data among ranks or to pull only a sample of documents.
+    """
+    start, stop, step = rank, limit, world_size
+    return islice(raw_iterator, start, stop, step)
+def weighted_f1_score(items):
+    """Return scikit-learn's weighted-average F1 over `items`.
+    Each item supplies the gold label at position 0 and the prediction at position 1.
+    """
+    from sklearn.metrics import f1_score
+    columns = list(zip(*items))
+    return f1_score(columns[0], columns[1], average="weighted")

Prism/LLaDA/LLaDA_Baseline/evaluation_script.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+import torch
+import random
+import numpy as np
+from dllm_eval.__main__ import cli_evaluate
+def set_seed(seed):
+    """Seed torch, `random` and NumPy with `seed` and set the cuDNN determinism flags."""
+    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
+        seed_fn(seed)
+    cudnn = torch.backends.cudnn
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+if __name__ == "__main__":
+    # Set before the CLI runs. NOTE(review): presumably these opt in to Hugging Face
+    # code-execution metrics and dataset loading scripts; confirm against the HF docs.
+    os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+    os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
+    # Fixed seed so repeated runs start from the same random state.
+    set_seed(42)
+    cli_evaluate()

Prism/LLaDA/LLaDA_Baseline/metrics/gsm8k_all.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import json
+import re
+import os
+import math
+import argparse
+from collections import Counter
+# Default results location (placeholder); it is the default of the -r/--res_path option.
+RES_PATH = "<PATH_TO_RESULTS_JSONL>"
+def last_boxed_only_string(string):
+    """Return the last boxed/fbox expression in `string`, including the command itself.
+    Handles both the brace form and the brace-less "boxed <value>" form.
+    Returns None when the input is empty, contains no such command, or the
+    braces after the command never balance.
+    """
+    if not string:
+        return None
+    start = max(string.rfind("\\boxed"), string.rfind("\\fbox"))
+    if start < 0:
+        return None
+    head = string[start:start+8]
+    if "\\boxed " in head and "{" not in head:
+        # Brace-less form: the value runs up to the next "$".
+        value = string[start:].split("\\boxed ")[-1].split("$")[0].strip()
+        return "\\boxed " + value
+    depth = 0
+    for pos in range(start, len(string)):
+        char = string[pos]
+        if char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                # First point where the braces balance again closes the box.
+                return string[start : pos + 1]
+    return None
+def remove_boxed(s):
+    """Strip the surrounding boxed/fbox command from `s` and return the inner text.
+    Returns None for empty input and `s` unchanged when no known wrapper is found.
+    """
+    if not s:
+        return None
+    spaced_prefix = "\\boxed "
+    if spaced_prefix in s:
+        return s[len(spaced_prefix) :]
+    for opener in ("\\boxed{", "\\fbox{"):
+        if opener in s and s.endswith("}"):
+            return s[len(opener) : -1]
+    return s
+def strip_string(string):
+    """Normalize an answer string for comparison.
+    Removes thousands separators, newlines, a few LaTeX spacing/sizing/degree
+    commands, escaped dollar and percent signs, a short "lhs=" prefix, all
+    spaces and any trailing periods. `None` becomes the empty string.
+    """
+    if string is None:
+        return ""
+    string = str(string).strip()
+    # Repeat until stable: one pass cannot rewrite adjacent groups such as "1,234,567".
+    while re.search(r"(\d),(\d{3})", string):
+        string = re.sub(r"(\d),(\d{3})", r"\1\2", string)
+    string = string.replace("\n", "").replace("\\!", "")
+    string = string.replace("tfrac", "frac").replace("dfrac", "frac")
+    string = string.replace("\\left", "").replace("\\right", "")
+    string = string.replace("^{\\circ}", "").replace("^\\circ", "")
+    # Only the escaped forms (backslash + sign) are removed; a bare "%" is kept.
+    string = string.replace("\\$", "").replace("\\%", "")
+    # Drop a short left-hand side such as "x = ".
+    if "=" in string and len(string.split("=")[0]) <= 5:
+        string = string.split("=")[1].strip()
+    string = string.replace(" ", "")
+    string = string.rstrip(".")
+    return string
+def normalize_to_number(s):
+    """Convert an answer to a float when possible.
+    The input is first cleaned with `strip_string`. A value with exactly one
+    "/" is evaluated as a fraction. When the text is not numeric (or the
+    denominator is zero) the cleaned string is returned instead.
+    """
+    s_clean = strip_string(s)
+    try:
+        if s_clean.count("/") == 1:
+            numerator, denominator = s_clean.split("/")
+            return float(numerator) / float(denominator)
+        return float(s_clean)
+    except (ValueError, ZeroDivisionError):
+        # Not a number: fall back to the cleaned text.
+        return s_clean
+def extract_answer_gsm8k_debug(text):
+    """Extract a final answer from a model response and report which rule produced it.
+    Rules are tried in order: the last boxed/fbox expression ("boxed"), an
+    <answer>...</answer> tag ("xml_tag"), the phrase "the answer is" within the
+    last 200 characters ("text_marker"), and the last number in the final 50
+    characters ("regex_last_num").
+    Returns:
+        tuple: (normalized answer, rule name). Empty input gives ("", "empty");
+        no match gives ("", "failed").
+    """
+    if not text:
+        return "", "empty"
+    text = text.replace("<|role_end|>", "").replace("<|endoftext|>", "").strip()
+    boxed = last_boxed_only_string(text)
+    if boxed:
+        ans = remove_boxed(boxed)
+        if ans:
+            return strip_string(ans), "boxed"
+    tag_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
+    if tag_match:
+        return strip_string(tag_match.group(1)), "xml_tag"
+    last_text = text[-200:]
+    marker = "the answer is"
+    idx = last_text.lower().rfind(marker)
+    if idx >= 0:
+        after = last_text[idx + len(marker):].strip()
+        # Cut at a newline or a sentence-ending period. A period followed by a
+        # digit is a decimal point and must stay ("3.5" must not become "3").
+        after = re.split(r"\.(?!\d)|\n", after)[0]
+        after = after.replace(":", "").replace("$", "").strip()
+        return strip_string(after), "text_marker"
+    tail = text[-50:]
+    nums = re.findall(r"(?<!\d)-?\d+\.?\d*(?!\d)", tail)
+    if nums:
+        return strip_string(nums[-1]), "regex_last_num"
+    return "", "failed"
+def extract_gold_gsm8k(target_str):
+    """Return the normalized gold answer: the text after the last "####", or the whole string when absent."""
+    # With no "####" present the last partition element is the full string.
+    gold_part = target_str.rpartition("####")[2]
+    return strip_string(gold_part)
+def is_equiv(pred, gold):
+    """Compare a prediction with the gold answer.
+    Both sides go through `normalize_to_number`; two floats are compared with a
+    relative tolerance of 1e-4, anything else by string equality.
+    """
+    p_val, g_val = (normalize_to_number(value) for value in (pred, gold))
+    both_numeric = isinstance(p_val, float) and isinstance(g_val, float)
+    if not both_numeric:
+        return str(p_val) == str(g_val)
+    return math.isclose(p_val, g_val, rel_tol=1e-4)
+def run_evaluation(target_path):
+    """Evaluate GSM8K result files with score-weighted answer voting.
+    For the `.jsonl` file named by `target_path`, or every `.jsonl` file found
+    recursively below it, each record's trajectories are parsed, the top 60% by
+    score vote with weight exp(score), and a report named
+    `eval_voted_<name>.json` is written next to the input file.
+    Args:
+        target_path: A results `.jsonl` file or a directory to search.
+    """
+    jsonl_files = []
+    if os.path.isdir(target_path):
+        for root, _dirs, files in os.walk(target_path):
+            for file in files:
+                if file.endswith(".jsonl") and not file.startswith("eval_voted_"):
+                    jsonl_files.append(os.path.join(root, file))
+    else:
+        jsonl_files = [target_path]
+    for file_path in jsonl_files:
+        print(f">>> 正在评测: {file_path}")
+        detailed_results = []
+        correct_voted_count = 0
+        correct_any_count = 0
+        total_count = 0
+        nfe_list = []
+        svf_list = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    item = json.loads(line)
+                except json.JSONDecodeError:
+                    # Malformed lines are skipped and do not count toward the total.
+                    continue
+                doc = item.get("doc", {})
+                ground_truth = extract_gold_gsm8k(str(item.get("target", "")))
+                total_nfe_item = item.get("nfe", 0)
+                svf_calls_item = item.get("svf_calls", 0)
+                nfe_list.append(total_nfe_item)
+                svf_list.append(svf_calls_item)
+                # `or []` also covers an explicit null value in the record.
+                trajectories = list(item.get("all_trajectories") or [])
+                if not trajectories:
+                    # Fall back to plain responses, all with a neutral score.
+                    for r in item.get("resps", []):
+                        text = r[0] if isinstance(r, list) else r
+                        trajectories.append({"resp": text, "score": 0.0})
+                parsed_paths = []
+                traj_debug_info = []
+                for idx, traj in enumerate(trajectories):
+                    raw_text = traj.get("resp", "")
+                    score = traj.get("score", 0.0)
+                    extracted, method = extract_answer_gsm8k_debug(raw_text)
+                    is_correct_single = False
+                    if extracted:
+                        is_correct_single = is_equiv(extracted, ground_truth)
+                        val_key = normalize_to_number(extracted)
+                        parsed_paths.append({
+                            "original_text": extracted,
+                            "val_key": val_key,
+                            "score": score,
+                            "method": method
+                        })
+                    traj_debug_info.append({
+                        "id": idx,
+                        "extracted": extracted,
+                        "score": score,
+                        "is_correct": is_correct_single,
+                        "extract_method": method
+                    })
+                if not parsed_paths:
+                    # No answer could be extracted from any trajectory: counts as wrong.
+                    detailed_results.append({
+                        "question": doc.get("question", "N/A"),
+                        "final_voted_answer": "",
+                        "ground_truth": ground_truth,
+                        "is_voted_correct": False,
+                        "trajectory_details": traj_debug_info,
+                        "nfe": total_nfe_item,
+                        "svf_calls": svf_calls_item
+                    })
+                    total_count += 1
+                    continue
+                # NOTE(review): scores at or below -999 are ignored here, presumably
+                # a sentinel for invalid trajectories; confirm with the producer.
+                has_correct = any(p['score'] > -999 and is_equiv(p['original_text'], ground_truth) for p in parsed_paths)
+                if has_correct:
+                    correct_any_count += 1
+                # Only the best-scoring 60% of parsed trajectories (at least one) vote.
+                parsed_paths.sort(key=lambda x: x['score'], reverse=True)
+                top_k_count = max(1, int(len(parsed_paths) * 0.6))
+                voting_candidates = parsed_paths[:top_k_count]
+                ans_stats = {}
+                for p in voting_candidates:
+                    k = p['val_key']
+                    if k not in ans_stats:
+                        ans_stats[k] = {
+                            "total_weight": 0.0,
+                            "count": 0,
+                            "max_score": -float('inf'),
+                            "best_repr": p['original_text']
+                        }
+                    try:
+                        weight = math.exp(p['score'])
+                    except OverflowError:
+                        weight = float('inf')
+                    ans_stats[k]["total_weight"] += weight
+                    ans_stats[k]["count"] += 1
+                    if p['score'] > ans_stats[k]["max_score"]:
+                        ans_stats[k]["max_score"] = p['score']
+                        ans_stats[k]["best_repr"] = p['original_text']
+                # Highest total weight wins; ties are broken by the best single score.
+                sorted_answers = sorted(
+                    ans_stats.items(),
+                    key=lambda x: (x[1]["total_weight"], x[1]["max_score"]),
+                    reverse=True
+                )
+                best_pred = str(sorted_answers[0][1]["best_repr"])
+                is_voted_correct = is_equiv(best_pred, ground_truth)
+                if is_voted_correct:
+                    correct_voted_count += 1
+                vote_summary = []
+                for val, info in sorted_answers:
+                    vote_summary.append({
+                        "answer": str(val),
+                        "count": info["count"],
+                        "total_weight": info["total_weight"],
+                        "is_correct": is_equiv(str(val), ground_truth)
+                    })
+                total_count += 1
+                detailed_results.append({
+                    "question": doc.get("question", "N/A"),
+                    "final_voted_answer": best_pred,
+                    "ground_truth": ground_truth,
+                    "is_voted_correct": is_voted_correct,
+                    "vote_stats": vote_summary,
+                    "trajectory_details": traj_debug_info,
+                    "nfe": total_nfe_item,
+                    "svf_calls": svf_calls_item
+                })
+        accuracy = (correct_voted_count / total_count * 100) if total_count > 0 else 0
+        pass_at_k = (correct_any_count / total_count * 100) if total_count > 0 else 0
+        avg_nfe = int(round(sum(nfe_list) / len(nfe_list))) if nfe_list else 0
+        avg_svf = int(round(sum(svf_list) / len(svf_list))) if svf_list else 0
+        print(f"--- Accuracy: {accuracy:.2f}% | NFE: {avg_nfe} | SVF: {avg_svf} ---")
+        output_name = f"eval_voted_{os.path.basename(file_path).replace('.jsonl', '.json')}"
+        output_path = os.path.join(os.path.dirname(file_path), output_name)
+        final_report = {
+            "summary": {
+                "accuracy": f"{accuracy:.2f}%",
+                # Share of questions where at least one trajectory was correct.
+                "pass_at_k": f"{pass_at_k:.2f}%",
+                "correct_voted": correct_voted_count,
+                "total": total_count,
+                "nfe": avg_nfe,
+                "svf_calls": avg_svf
+            },
+            "details": detailed_results
+        }
+        with open(output_path, 'w', encoding='utf-8') as out_f:
+            json.dump(final_report, out_f, ensure_ascii=False, indent=4)
+if __name__ == "__main__":
+    # CLI entry point: -r/--res_path selects the results file or directory
+    # and falls back to the RES_PATH placeholder when omitted.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-r", "--res_path", type=str, default=RES_PATH)
+    args = parser.parse_args()
+    run_evaluation(args.res_path)

Prism/LLaDA/LLaDA_Baseline/metrics/humaneval_all.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import os
+import sys
+import json
+import ast
+import traceback
+import glob
+import math
+import argparse
+from typing import Dict, List, Optional, Set, Tuple
+from collections import Counter
+import evaluate as hf_evaluate
+import re
+# Default results location (placeholder); it is the default of the -r/--res_path option.
+RES_PATH = "<PATH_TO_RESULTS_JSONL>"
+# NOTE(review): presumably required by the `code_eval` metric before it will
+# execute generated code; confirm against the Hugging Face evaluate docs.
+os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+def extract_python_code(text: str) -> str:
+    """Pull the Python code out of a model response.
+    Special end tokens are removed and, if present, only the content of the
+    first <answer>...</answer> tag is considered. The last fenced code block is
+    preferred (a "```python" fence first, then a plain fence). Without any
+    fence, lines are kept up to the first one containing a stop phrase such as
+    "Explanation:".
+    Returns:
+        str: The extracted code, stripped; "" for empty input.
+    """
+    if not text:
+        return ""
+    for token in ("<|role_end|>", "<|endoftext|>", "<|notification_end|>"):
+        text = text.replace(token, "")
+    tag_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
+    if tag_match:
+        text = tag_match.group(1)
+    if "```python" in text:
+        # Last "```python" block, cut at its closing fence when there is one.
+        return text.split("```python")[-1].split("```")[0].strip()
+    if "```" in text:
+        segments = text.split("```")
+        # Odd-indexed segments lie inside a fence. Taking the last odd index
+        # returns the last code block rather than the prose after its closing
+        # fence, and still handles a final fence that is never closed.
+        if len(segments) % 2 == 0:
+            last_inside = len(segments) - 1
+        else:
+            last_inside = len(segments) - 2
+        return segments[last_inside].strip()
+    cleaned_lines = []
+    stop_words = ["Explanation:", "Example:", "Test Case:", "Output:"]
+    for line in text.split('\n'):
+        if any(sw in line for sw in stop_words):
+            break
+        cleaned_lines.append(line)
+    return "\n".join(cleaned_lines).strip()
+def normalize_code_for_voting(code: str) -> str:
+    """Return a canonical form of `code` so equivalent candidates vote together.
+    Parsable code is re-emitted with `ast.unparse` after removing the leading
+    docstring of the module and of every function and class, which also drops
+    comments and formatting differences. Code that cannot be parsed falls back
+    to the raw text with all whitespace removed.
+    """
+    try:
+        tree = ast.parse(code)
+        for node in ast.walk(tree):
+            if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.Module)):
+                if (node.body and isinstance(node.body[0], ast.Expr) and
+                    isinstance(node.body[0].value, ast.Constant) and isinstance(node.body[0].value.value, str)):
+                    node.body.pop(0)
+        return ast.unparse(tree).strip()
+    except Exception:
+        # Any parse or unparse failure uses the fallback key; interpreter-exit
+        # signals such as KeyboardInterrupt are no longer swallowed.
+        return re.sub(r"\s+", "", code)
+def sanitize(prompt: str, completion: str, entrypoint: str) -> str:
+    """Return runnable code: the completion alone when it already defines `entrypoint`, else prompt plus completion."""
+    defines_entrypoint = f"def {entrypoint}" in completion
+    return completion if defines_entrypoint else prompt + "\n" + completion
+def run_evaluation(target_path):
+    """Evaluate HumanEval result files by majority voting over generated programs.
+    For the `.jsonl` file named by `target_path`, or every `.jsonl` file found
+    recursively below it, each record's responses are reduced to one voted
+    program (syntactically valid candidates first, then the most frequent
+    normalized form). The voted programs are run through the `code_eval`
+    metric and a report named `eval_voted_<name>.json` is written next to the
+    input file.
+    Args:
+        target_path: A results `.jsonl` file or a directory to search.
+    """
+    if os.path.isdir(target_path):
+        jsonl_files = glob.glob(os.path.join(target_path, "**/*.jsonl"), recursive=True)
+    else:
+        jsonl_files = [target_path]
+    if not jsonl_files:
+        print(f"未在路径 {target_path} 下找到任何 .jsonl 文件")
+        return
+    print(f"共找到 {len(jsonl_files)} 个评测任务")
+    code_eval = hf_evaluate.load("code_eval")
+    for file_path in jsonl_files:
+        print(f"\n>>> 正在评测: {file_path}")
+        all_predictions = []
+        all_references = []
+        detailed_results = []
+        nfe_list = []
+        svf_list = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            # Stream the file line by line instead of loading it whole.
+            for line in f:
+                if not line.strip():
+                    continue
+                item = json.loads(line)
+                doc = item.get("doc", {})
+                prompt = doc.get("prompt", "")
+                entry_point = doc.get("entry_point", "")
+                reference = doc.get("test", "")
+                current_nfe = item.get("nfe", 0)
+                current_svf = item.get("svf_calls", 0)
+                nfe_list.append(current_nfe)
+                svf_list.append(current_svf)
+                resps = item.get("resps", [])
+                candidate_stats = {}
+                for r in resps:
+                    raw_text = r[0] if isinstance(r, list) else r
+                    completion = extract_python_code(raw_text)
+                    full_code = sanitize(prompt, completion, entry_point)
+                    try:
+                        ast.parse(full_code)
+                        is_valid = True
+                    except Exception:
+                        # Any parse failure marks the candidate as invalid.
+                        is_valid = False
+                    logic_norm = normalize_code_for_voting(full_code)
+                    if not logic_norm:
+                        continue
+                    # The first candidate seen for a normalized form supplies its code and validity.
+                    if logic_norm not in candidate_stats:
+                        candidate_stats[logic_norm] = {"count": 0, "valid": is_valid, "code": full_code}
+                    candidate_stats[logic_norm]["count"] += 1
+                if not candidate_stats:
+                    # Nothing usable was generated: submit the bare prompt.
+                    voted_code = prompt
+                else:
+                    sorted_logics = sorted(
+                        candidate_stats.keys(),
+                        key=lambda k: (candidate_stats[k]["valid"], candidate_stats[k]["count"]),
+                        reverse=True
+                    )
+                    voted_code = candidate_stats[sorted_logics[0]]["code"]
+                all_predictions.append([voted_code])
+                all_references.append(reference)
+                detailed_results.append({
+                    "task_id": doc.get("task_id", doc.get("name", "N/A")),
+                    "voted_code": voted_code,
+                    "nfe": current_nfe,
+                    "svf_calls": current_svf,
+                    "candidates_count": len(candidate_stats)
+                })
+        if not all_predictions:
+            continue
+        print(f"正在执行代码测试 (共 {len(all_predictions)} 题)...")
+        pass_at_k, exec_results = code_eval.compute(
+            references=all_references,
+            predictions=all_predictions,
+            k=[1],
+            num_workers=4
+        )
+        accuracy = pass_at_k.get("pass@1", 0.0) * 100
+        avg_nfe = int(round(sum(nfe_list) / len(nfe_list))) if nfe_list else 0
+        avg_svf = int(round(sum(svf_list) / len(svf_list))) if svf_list else 0
+        print(f"Accuracy: {accuracy:.2f}% | NFE: {avg_nfe} | SVF: {avg_svf} ---")
+        output_name = f"eval_voted_{os.path.basename(file_path).replace('.jsonl', '.json')}"
+        output_path = os.path.join(os.path.dirname(file_path), output_name)
+        # Attach the execution verdict of each problem's single voted program.
+        for i, detail in enumerate(detailed_results):
+            res_list = exec_results.get(i, [])
+            detail["is_correct"] = res_list[0][1]["passed"] if res_list else False
+        final_report = {
+            "summary": {
+                "accuracy": f"{accuracy:.2f}%",
+                "nfe": avg_nfe,
+                "svf_calls": avg_svf
+            },
+            "details": detailed_results
+        }
+        with open(output_path, 'w', encoding='utf-8') as out_f:
+            json.dump(final_report, out_f, ensure_ascii=False, indent=4)
+        print(f"报告已保存至: {output_path}")
+if __name__ == "__main__":
+    # CLI entry point: -r/--res_path selects the results file or directory
+    # and falls back to the RES_PATH placeholder when omitted.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-r", "--res_path", type=str, default=RES_PATH)
+    args = parser.parse_args()
+    run_evaluation(args.res_path)

Prism/LLaDA/LLaDA_Baseline/metrics/math500_all.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import json
+import re
+import os
+import math
+import argparse
+from collections import Counter
+# Default results location (placeholder); it is the default of the -r/--res_path option.
+RES_PATH = "<PATH_TO_RESULTS_JSONL>"
+def extract_answer(text):
+    """Extract the final answer from a model response.
+    Rules are tried in order: the last boxed expression (nested braces up to
+    two levels), an <answer>...</answer> tag, the text after the last
+    "the answer is", and finally the last number-like token in the final 50
+    characters.
+    Returns:
+        tuple: (answer, found_explicitly). The flag is False for the
+        last-number fallback and when nothing was found.
+    """
+    if not text:
+        return "", False
+    text = text.replace("<|role_end|>", "").replace("<|endoftext|>", "").strip()
+    boxed_pattern = r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}"
+    all_boxes = re.findall(boxed_pattern, text)
+    if all_boxes:
+        return all_boxes[-1], True
+    tag_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
+    if tag_match:
+        return tag_match.group(1).strip(), True
+    marker = "the answer is"
+    if marker in text.lower():
+        pos = text.lower().rfind(marker)
+        after_text = text[pos + len(marker):].strip()
+        # Also drop a leading "$": otherwise an answer written as "$42$" is
+        # cut at its own opening delimiter and comes back empty.
+        after_text = re.sub(r"^[:\s$]+", "", after_text)
+        return after_text.split('\n')[0].split('$')[0].strip(), True
+    tail = text[-50:].strip()
+    nums = re.findall(r"(-?\d+[\./\d]*|\\sqrt\{\d+\}|\(-?\d+.*?\))", tail)
+    if nums:
+        return nums[-1], False
+    return "", False
+def normalize_math(string):
+    """Reduce a math answer to a canonical lowercase string for comparison.
+    The steps run in a fixed order: strip tags and filler phrases, drop
+    "name =" prefixes, unwrap text and font commands, rewrite fractions as
+    "a/b", remove LaTeX spacing, braces, sizing commands, currency and percent
+    signs, unit words and degree marks, spell pi as "pi", remove thousands
+    separators, trailing punctuation and spaces, and keep only the text after
+    the last "=". Falsy input gives "".
+    """
+    if not string:
+        return ""
+    result = str(string).lower().strip()
+    for literal in ("</reasoning>", "</answer>", "<answer>", "...", "cannot be determined"):
+        result = result.replace(literal, "")
+    latex_rewrites = (
+        (r"([a-z]+|\\theta|\\alpha|\\pi)\s*=\s*", ""),
+        (r"\\text\{([^}]*)\}", r"\1"),
+        (r"\\(mathbf|mathrm|bold|unit|mbox|operatorname|mathrm)\{([^}]*)\}", r"\2"),
+        (r"\\(d|t)?frac\{([^{}]*)\}\{([^{}]*)\}", r"\2/\3"),
+    )
+    for pattern, replacement in latex_rewrites:
+        result = re.sub(pattern, replacement, result)
+    for literal in ("\\!", "\\ ", "{", "}", "\\left", "\\right", "\\$", "$", "\\%", "%"):
+        result = result.replace(literal, "")
+    units_pattern = r"(units?|cm\^2|cm|inches|inch|square|degrees?|radians?|miles?|per|hour|cents?)"
+    result = re.sub(units_pattern, "", result)
+    for literal in ("^{\\circ}", "^\\circ", "°", "\\degree"):
+        result = result.replace(literal, "")
+    result = result.replace("\\pi", "pi")
+    result = re.sub(r"(\d),(\d{3})", r"\1\2", result)
+    result = result.rstrip(".:,; ").replace(" ", "")
+    # Keep only what follows the last "=" (the whole string when there is none).
+    return result.rpartition("=")[2]
+def is_equiv(pred, gold):
+    """Decide whether a predicted math answer matches the gold answer.
+    Checks, in order: equality after `normalize_math`; equality of the part of
+    `pred` after its last "=" ; numeric closeness (relative tolerance 1e-4,
+    single-slash fractions supported); and finally equality after dropping
+    every character outside a-z, 0-9, "/", "," and "-". An empty prediction
+    never matches.
+    """
+    if not pred:
+        return False
+    p, g = normalize_math(pred), normalize_math(gold)
+    if p == g:
+        return True
+    if "=" in pred and normalize_math(pred.split("=")[-1]) == g:
+        return True
+    def to_float(s):
+        if '/' in s and s.count('/') == 1:
+            parts = s.split('/')
+            return float(parts[0]) / float(parts[1])
+        # Ignore anything from the first underscore on (e.g. a base subscript).
+        if '_' in s:
+            s = s.split('_')[0]
+        return float(s)
+    try:
+        return math.isclose(to_float(p), to_float(g), rel_tol=1e-4)
+    except (ValueError, ZeroDivisionError):
+        # Not numeric: compare with punctuation and LaTeX residue removed.
+        p_fuzzy = re.sub(r"[^a-z0-9/,\-]", "", p)
+        g_fuzzy = re.sub(r"[^a-z0-9/,\-]", "", g)
+        return p_fuzzy == g_fuzzy if p_fuzzy else False
+def run_evaluation(target_path):
+    """Evaluate MATH-500 result files with score-weighted answer voting.
+    For the `.jsonl` file named by `target_path`, or every `.jsonl` file found
+    recursively below it, the answers extracted from each record's
+    trajectories vote with weight exp(score), and a report named
+    `eval_voted_<name>.json` is written next to the input file.
+    Args:
+        target_path: A results `.jsonl` file or a directory to search.
+    """
+    jsonl_files = []
+    if os.path.isdir(target_path):
+        for root, _dirs, files in os.walk(target_path):
+            for file in files:
+                if file.endswith(".jsonl") and not file.startswith("eval_voted_"):
+                    jsonl_files.append(os.path.join(root, file))
+    else:
+        jsonl_files = [target_path]
+    for file_path in jsonl_files:
+        print(f">>> 正在评测: {file_path}")
+        detailed_results = []
+        voted_correct_count = 0
+        pass_at_k_count = 0
+        total_count = 0
+        nfe_list = []
+        svf_list = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    item = json.loads(line)
+                except json.JSONDecodeError:
+                    # Malformed lines are skipped and do not count toward the total.
+                    continue
+                doc = item.get("doc", {})
+                ground_truth = str(item.get("target", doc.get("answer", "")))
+                current_nfe = item.get("nfe", 0)
+                nfe_list.append(current_nfe)
+                current_svf = item.get("svf_calls", 0)
+                svf_list.append(current_svf)
+                ans_stats = {}
+                # `or []` also covers an explicit null value in the record.
+                trajectories = item.get("all_trajectories") or []
+                has_correct_trajectory = False
+                for traj in trajectories:
+                    raw_text = traj.get("resp", "")
+                    score = traj.get("score", 0)
+                    extracted, _ = extract_answer(raw_text)
+                    if not extracted:
+                        continue
+                    if is_equiv(extracted, ground_truth):
+                        has_correct_trajectory = True
+                    # Answers that normalize to the same string share one vote bucket.
+                    norm = normalize_math(extracted)
+                    if norm not in ans_stats:
+                        ans_stats[norm] = {
+                            "count": 0,
+                            "max_score": -float('inf'),
+                            "total_weight": 0.0,
+                            "original": extracted
+                        }
+                    ans_stats[norm]["count"] += 1
+                    if score > ans_stats[norm]["max_score"]:
+                        ans_stats[norm]["max_score"] = score
+                    try:
+                        weight = math.exp(score)
+                    except OverflowError:
+                        weight = float('inf')
+                    ans_stats[norm]["total_weight"] += weight
+                if has_correct_trajectory:
+                    pass_at_k_count += 1
+                if not ans_stats:
+                    best_pred = ""
+                else:
+                    # Highest total weight wins; ties fall back to best score, then count.
+                    sorted_norms = sorted(
+                        ans_stats.keys(),
+                        key=lambda x: (ans_stats[x]["total_weight"], ans_stats[x]["max_score"], ans_stats[x]["count"]),
+                        reverse=True
+                    )
+                    best_norm = sorted_norms[0]
+                    best_pred = ans_stats[best_norm]["original"]
+                is_voted_correct = False
+                if best_pred and is_equiv(best_pred, ground_truth):
+                    voted_correct_count += 1
+                    is_voted_correct = True
+                total_count += 1
+                detailed_results.append({
+                    "question": doc.get("problem", "N/A"),
+                    "final_voted_answer": best_pred,
+                    "ground_truth": ground_truth,
+                    "is_voted_correct": is_voted_correct,
+                    "nfe": current_nfe,
+                    "svf_calls": current_svf
+                })
+        pass_at_1_accuracy = (voted_correct_count / total_count * 100) if total_count > 0 else 0
+        pass_at_k = (pass_at_k_count / total_count * 100) if total_count > 0 else 0
+        avg_nfe = int(round(sum(nfe_list) / len(nfe_list))) if nfe_list else 0
+        avg_svf = int(round(sum(svf_list) / len(svf_list))) if svf_list else 0
+        print(f"---  Accuracy: {pass_at_1_accuracy:.2f}% | NFE: {avg_nfe} | SVF: {avg_svf} ---")
+        output_name = f"eval_voted_{os.path.basename(file_path).replace('.jsonl', '.json')}"
+        output_path = os.path.join(os.path.dirname(file_path), output_name)
+        final_report = {
+            "summary": {
+                "accuracy": f"{pass_at_1_accuracy:.2f}%",
+                # Share of questions where at least one trajectory was correct.
+                "pass_at_k": f"{pass_at_k:.2f}%",
+                "correct_voted_count": voted_correct_count,
+                "total": total_count,
+                "nfe": avg_nfe,
+                "svf_calls": avg_svf
+            },
+            "details": detailed_results
+        }
+        with open(output_path, 'w', encoding='utf-8') as out_f:
+            json.dump(final_report, out_f, ensure_ascii=False, indent=4)
+if __name__ == "__main__":
+    # CLI entry point: -r/--res_path selects the results file or directory
+    # and falls back to the RES_PATH placeholder when omitted.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-r", "--res_path", type=str, default=RES_PATH)
+    args = parser.parse_args()
+    run_evaluation(args.res_path)

Prism/LLaDA/LLaDA_Baseline/metrics/mbpp_all.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import os
+import json
+import ast
+import glob
+import re
+import argparse
+from typing import Dict, List, Optional, Set, Tuple
+import evaluate as hf_evaluate
+# Placeholder results location. NOTE(review): presumably the default of a
+# -r/--res_path option as in the sibling metric scripts; confirm further down.
+RES_PATH = "<PATH_TO_RESULTS_JSONL>"
+# NOTE(review): presumably required by the `code_eval` metric before it will
+# execute generated code; confirm against the Hugging Face evaluate docs.
+os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+# NOTE(review): presumably silences tokenizer fork/parallelism warnings; confirm.
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+def extract_python_code(text: str) -> str:
+    """Pull the Python code out of a model response.
+    Special end tokens are removed. Among <answer>...</answer> blocks the first
+    one containing "def " is used, else the first block. The first fenced
+    segment containing "def " is returned ("```python" fences take priority
+    over plain ones). Otherwise lines are kept up to the first one containing
+    a stop phrase such as "Explanation:".
+    Returns:
+        str: The extracted code, stripped; "" for empty input.
+    """
+    if not text:
+        return ""
+    for special in ("<|role_end|>", "<|endoftext|>", "<|notification_end|>"):
+        text = text.replace(special, "")
+    answer_blocks = re.findall(r"<answer>(.*?)</answer>", text, re.DOTALL)
+    if answer_blocks:
+        text = next((blk for blk in answer_blocks if "def " in blk), answer_blocks[0])
+    if "```python" in text:
+        # Each piece after an opener is cut at its closing fence.
+        fenced = [seg.split("```")[0].strip() for seg in text.split("```python")[1:]]
+    elif "```" in text:
+        fenced = [seg.strip() for seg in text.split("```")[1:]]
+    else:
+        fenced = []
+    for code in fenced:
+        if "def " in code:
+            return code
+    # No fenced function found: keep leading lines until a stop phrase appears.
+    kept = []
+    for line in text.split('\n'):
+        if any(sw in line for sw in ["Explanation:", "Example:", "Test Case:", "Output:", "Reasoning:"]):
+            break
+        kept.append(line)
+    return "\n".join(kept).strip()
+def normalize_code_for_voting(code: str) -> str:
+    """Return a canonical form of ``code`` used as the grouping key for voting.
+
+    Parsable code is round-tripped through the AST, which drops comments and
+    formatting differences, after removing the leading docstring of the module
+    and of every function (sync or async) and class. Code that cannot be parsed
+    falls back to the raw text with all whitespace removed.
+
+    Args:
+        code: Candidate source text.
+
+    Returns:
+        The normalized source string.
+    """
+    try:
+        tree = ast.parse(code)
+        for node in ast.walk(tree):
+            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)):
+                continue
+            if ast.get_docstring(node, clean=False) is not None:
+                node.body.pop(0)
+                # A docstring-only body would otherwise unparse to an empty, invalid block.
+                if not node.body and not isinstance(node, ast.Module):
+                    node.body.append(ast.Pass())
+        return ast.unparse(tree).strip()
+    except (SyntaxError, ValueError, RecursionError, MemoryError):
+        # Only parse/unparse failures are expected here; anything else
+        # (including KeyboardInterrupt) must propagate.
+        return re.sub(r"\s+", "", code)
+def run_evaluation(target_path):
+    """Evaluate MBPP generations and write one ``eval_mbpp_*.json`` report per results file.
+
+    For every record the candidate completions come from ``all_trajectories``
+    when that list is non-empty, otherwise from ``resps``. Candidates are
+    grouped by normalized source; the group ranking highest on
+    (parses as Python, best score, occurrence count) supplies the "voted"
+    program. The voted programs and the full candidate lists are both executed
+    against the task's tests with the Hugging Face ``code_eval`` metric.
+
+    Args:
+        target_path: A results ``.jsonl`` file, or a directory searched
+            recursively for ``.jsonl`` files (names starting with
+            ``eval_mbpp_`` are skipped).
+
+    Returns:
+        None. Progress is printed and each report is written next to its input.
+    """
+    target_path = os.path.abspath(target_path)
+    if os.path.isdir(target_path):
+        search_pattern = os.path.join(target_path, "**/*.jsonl")
+        jsonl_files = glob.glob(search_pattern, recursive=True)
+        jsonl_files = [f for f in jsonl_files if not os.path.basename(f).startswith("eval_mbpp_")]
+    else:
+        jsonl_files = [target_path]
+    if not jsonl_files:
+        print(f"Error: 在路径 {target_path} 及其子目录下未找到任何 .jsonl 文件。")
+        return
+    try:
+        code_eval = hf_evaluate.load("code_eval")
+    except Exception as exc:
+        # Report the cause instead of hiding it; unlike the former bare
+        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
+        print("Error: Could not load code_eval. Ensure 'evaluate' and 'code_eval' are installed.")
+        print(f"Cause: {exc!r}")
+        return
+    for file_path in jsonl_files:
+        print(f"\n>>> 正在评测 MBPP 文件: {file_path}")
+        all_candidate_predictions = []
+        all_voted_predictions = []
+        all_references = []
+        detailed_results = []
+        nfe_list = []
+        svf_list = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                item = json.loads(line)
+                doc = item.get("doc", {})
+                test_list = doc.get("test_list", [])
+                test_setup = doc.get("test_setup_code", "")
+                # Reference program executed after each prediction: setup code followed by the asserts.
+                full_reference = (test_setup + "\n" + "\n".join(test_list)).strip()
+                item_nfe = item.get("nfe", 0)
+                item_svf = item.get("svf_calls", 0)
+                nfe_list.append(item_nfe)
+                svf_list.append(item_svf)
+                resps = item.get("resps", [])
+                trajs = item.get("all_trajectories", [])
+                candidate_stats = {}
+                processed_candidates = []
+                source_data = trajs if trajs else resps
+                for entry in source_data:
+                    if isinstance(entry, dict):
+                        raw_text = entry.get("resp", "")
+                        score = entry.get("score", 0)
+                    elif isinstance(entry, list):
+                        # List entries carry the text in their first element; an empty list has none.
+                        raw_text = entry[0] if entry else ""
+                        score = 0
+                    else:
+                        raw_text = entry
+                        score = 0
+                    if score is None:
+                        # A JSON null score would break the max() comparison below.
+                        score = 0
+                    if not isinstance(raw_text, str):
+                        continue
+                    code = extract_python_code(raw_text)
+                    if not code:
+                        continue
+                    processed_candidates.append(code)
+                    try:
+                        ast.parse(code)
+                        is_valid = True
+                    except (SyntaxError, ValueError, RecursionError, MemoryError):
+                        is_valid = False
+                    norm = normalize_code_for_voting(code)
+                    if norm not in candidate_stats:
+                        candidate_stats[norm] = {"count": 0, "valid": is_valid, "code": code, "max_score": -float('inf')}
+                    candidate_stats[norm]["count"] += 1
+                    candidate_stats[norm]["max_score"] = max(candidate_stats[norm]["max_score"], score)
+                if not candidate_stats:
+                    voted_code = ""
+                else:
+                    # Rank groups: parsable first, then best score, then how often the program occurred.
+                    sorted_norms = sorted(
+                        candidate_stats.keys(),
+                        key=lambda k: (candidate_stats[k]["valid"], candidate_stats[k]["max_score"], candidate_stats[k]["count"]),
+                        reverse=True
+                    )
+                    voted_code = candidate_stats[sorted_norms[0]]["code"]
+                all_candidate_predictions.append(processed_candidates if processed_candidates else [""])
+                all_voted_predictions.append([voted_code])
+                all_references.append(full_reference)
+                detailed_results.append({
+                    "task_id": doc.get("task_id", "N/A"),
+                    "voted_code": voted_code,
+                    "nfe": item_nfe,
+                    "svf_calls": item_svf,
+                    "candidates_count": len(processed_candidates)
+                })
+        if not all_voted_predictions:
+            continue
+        print(f"正在测试代码 (共 {len(all_voted_predictions)} 题)...")
+        res_voted, details_voted = code_eval.compute(references=all_references, predictions=all_voted_predictions, k=[1])
+        res_pk, _ = code_eval.compute(references=all_references, predictions=all_candidate_predictions, k=[1])
+        acc_voted = res_voted.get("pass@1", 0.0) * 100
+        acc_pk = res_pk.get("pass@1", 0.0) * 100
+        avg_nfe = int(round(sum(nfe_list) / len(nfe_list))) if nfe_list else 0
+        avg_svf = int(round(sum(svf_list) / len(svf_list))) if svf_list else 0
+        print(f"--- Pass@1: {acc_voted:.2f}% | NFE: {avg_nfe} | SVF: {avg_svf} ---")
+        for i, detail in enumerate(detailed_results):
+            # details_voted maps the task index to a list of (completion_id, result) pairs.
+            detail["is_voted_correct"] = details_voted.get(i, [[0, {"passed": False}]])[0][1]["passed"]
+        file_dir = os.path.dirname(file_path)
+        base_name = os.path.basename(file_path)
+        output_name = f"eval_mbpp_{base_name.replace('.jsonl', '.json')}"
+        output_path = os.path.join(file_dir, output_name)
+        final_report = {
+            "summary": {
+                "pass_at_1": f"{acc_voted:.2f}%",
+                # pass@1 over all extracted candidates; it was computed before but never reported.
+                "candidates_pass_at_1": f"{acc_pk:.2f}%",
+                "avg_nfe": avg_nfe,
+                "avg_svf": avg_svf
+            },
+            "details": detailed_results
+        }
+        with open(output_path, 'w', encoding='utf-8') as out_f:
+            json.dump(final_report, out_f, ensure_ascii=False, indent=4)
+        print(f"成功保存结果至: {output_path}")
+if __name__ == "__main__":
+    # Script entry point: evaluate the path given with -r/--res_path (defaults to RES_PATH).
+    cli = argparse.ArgumentParser()
+    cli.add_argument("-r", "--res_path", type=str, default=RES_PATH)
+    run_evaluation(cli.parse_args().res_path)

Prism/LLaDA/LLaDA_Baseline/requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+sacrebleu
+evaluate
+datasets
+numpy
+pandas
+tqdm
+regex
+sqlitedict
+pytablewriter

Prism/LLaDA/LLaDA_Baseline/scripts/run_gsm8k.sh ADDED Viewed

	@@ -0,0 +1,32 @@

+#!/bin/bash
+# Baseline LLaDA run on GSM8K: launches evaluation_script.py through accelerate and
+# streams generations to ${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl.
+# Abort on the first failing command and echo every command as it runs.
+set -e
+set -x
+# Placeholders: point these at the project checkout and the model weights before running.
+PROJECT_ROOT="<PATH_TO_YOUR_ROOT>"
+cd "$PROJECT_ROOT"
+MODEL_PATH="<PATH_TO_YOUR_LLaDA_8B_INSTRUCT_WEIGHTS>"
+BASE_OUTPUT_PATH="${PROJECT_ROOT}/outputs/results_gsm8k"
+# Restrict the run to GPU 0 and fetch Hugging Face assets through the hf-mirror.com endpoint.
+export CUDA_VISIBLE_DEVICES=0
+export HF_ENDPOINT=https://hf-mirror.com
+# Generation settings forwarded below as gen_length / steps / block_length.
+LENGTH=256
+STEPS=32
+BLOCK=32
+TASK="gsm8k"
+# Sub-directory of BASE_OUTPUT_PATH that receives this run's outputs.
+NAME="baseline"
+mkdir -p "${BASE_OUTPUT_PATH}/${NAME}"
+accelerate launch evaluation_script.py \
+    --model LLaDA \
+    --tasks ${TASK} \
+    --batch_size 1 \
+    --model_args "pretrained=${MODEL_PATH},mask_id=126336,assistant_prefix=<reasoning>" \
+    --gen_kwargs "use_hts=True,hts_N=1,hts_mode=False,steps=${STEPS},block_length=${BLOCK},gen_length=${LENGTH},task_type=math,temperature=0.7,realtime_output=${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl" \
+    --num_fewshot 0 \
+    --confirm_run_unsafe_code \
+    --output_path "${BASE_OUTPUT_PATH}/${NAME}"

Prism/LLaDA/LLaDA_Baseline/scripts/run_humaneval.sh ADDED Viewed

	@@ -0,0 +1,29 @@

+#!/bin/bash
+# Baseline LLaDA run on HumanEval: launches evaluation_script.py through accelerate and
+# streams generations to ${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl.
+# Abort on the first failing command and echo every command as it runs.
+set -e
+set -x
+# Placeholders: point these at the project checkout and the model weights before running.
+PROJECT_ROOT="<PATH_TO_YOUR_ROOT>"
+MODEL_PATH="<PATH_TO_YOUR_LLaDA_8B_INSTRUCT_WEIGHTS>"
+BASE_OUTPUT_PATH="${PROJECT_ROOT}/outputs/results_humaneval"
+cd "$PROJECT_ROOT"
+# Expose GPUs 0-3 and fetch Hugging Face assets through the hf-mirror.com endpoint.
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export HF_ENDPOINT=https://hf-mirror.com
+# Generation settings forwarded below as gen_length / steps / block_length.
+LENGTH=512
+STEPS=32
+BLOCK=32
+TASK="humaneval"
+# Sub-directory of BASE_OUTPUT_PATH that receives this run's outputs.
+NAME="baseline"
+mkdir -p "${BASE_OUTPUT_PATH}/${NAME}"
+accelerate launch evaluation_script.py \
+    --model LLaDA \
+    --tasks ${TASK} \
+    --batch_size 1 \
+    --model_args "pretrained=${MODEL_PATH},mask_id=126336,assistant_prefix=<reasoning>" \
+    --gen_kwargs "use_hts=True,hts_N=1,hts_mode=False,steps=${STEPS},block_length=${BLOCK},gen_length=${LENGTH},task_type=code,temperature=0.7,realtime_output=${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl" \
+    --num_fewshot 0 \
+    --confirm_run_unsafe_code \
+    --output_path "${BASE_OUTPUT_PATH}/${NAME}"

Prism/LLaDA/LLaDA_Baseline/scripts/run_math500.sh ADDED Viewed

	@@ -0,0 +1,29 @@

+#!/bin/bash
+# Baseline LLaDA run on MATH500: launches evaluation_script.py through accelerate and
+# streams generations to ${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl.
+# Abort on the first failing command and echo every command as it runs.
+set -e
+set -x
+# Placeholders: point these at the project checkout and the model weights before running.
+PROJECT_ROOT="<PATH_TO_YOUR_PRISM_ROOT>"
+MODEL_PATH="<PATH_TO_YOUR_LLaDA_8B_INSTRUCT_WEIGHTS>"
+BASE_OUTPUT_PATH="${PROJECT_ROOT}/outputs/results_math500"
+cd "$PROJECT_ROOT"
+# Restrict the run to GPU 0 and fetch Hugging Face assets through the hf-mirror.com endpoint.
+export CUDA_VISIBLE_DEVICES=0
+export HF_ENDPOINT=https://hf-mirror.com
+# Generation settings forwarded below as gen_length / steps / block_length.
+LENGTH=256
+STEPS=32
+BLOCK=32
+TASK="math500"
+# Sub-directory of BASE_OUTPUT_PATH that receives this run's outputs.
+NAME="baseline"
+mkdir -p "${BASE_OUTPUT_PATH}/${NAME}"
+accelerate launch evaluation_script.py \
+    --model LLaDA \
+    --tasks ${TASK} \
+    --batch_size 1 \
+    --model_args "pretrained=${MODEL_PATH},mask_id=126336,assistant_prefix=<reasoning>" \
+    --gen_kwargs "use_hts=True,hts_N=1,hts_mode=False,steps=${STEPS},block_length=${BLOCK},gen_length=${LENGTH},task_type=math,temperature=0.7,realtime_output=${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl" \
+    --num_fewshot 0 \
+    --confirm_run_unsafe_code \
+    --output_path "${BASE_OUTPUT_PATH}/${NAME}"

Prism/LLaDA/LLaDA_Baseline/scripts/run_mbpp.sh ADDED Viewed

	@@ -0,0 +1,29 @@

+#!/bin/bash
+# Baseline LLaDA run on MBPP: launches evaluation_script.py through accelerate and
+# streams generations to ${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl.
+# Abort on the first failing command and echo every command as it runs.
+set -e
+set -x
+# Placeholders: point these at the project checkout and the model weights before running.
+PROJECT_ROOT="<PATH_TO_YOUR_PRISM_ROOT>"
+MODEL_PATH="<PATH_TO_YOUR_LLaDA_8B_INSTRUCT_WEIGHTS>"
+BASE_OUTPUT_PATH="${PROJECT_ROOT}/outputs/results_mbpp_k4"
+cd "$PROJECT_ROOT"
+# Restrict the run to GPU 0 and fetch Hugging Face assets through the hf-mirror.com endpoint.
+export CUDA_VISIBLE_DEVICES=0
+export HF_ENDPOINT=https://hf-mirror.com
+# Generation settings forwarded below as gen_length / steps / block_length.
+LENGTH=512
+STEPS=32
+BLOCK=32
+TASK="mbpp"
+# Sub-directory of BASE_OUTPUT_PATH that receives this run's outputs.
+NAME="baseline"
+mkdir -p "${BASE_OUTPUT_PATH}/${NAME}"
+# MBPP is a code-generation benchmark, so task_type is "code" as in run_humaneval.sh;
+# it was "math", apparently carried over from the math launch scripts.
+accelerate launch evaluation_script.py \
+    --model LLaDA \
+    --tasks ${TASK} \
+    --batch_size 1 \
+    --model_args "pretrained=${MODEL_PATH},mask_id=126336,assistant_prefix=<reasoning>" \
+    --gen_kwargs "use_hts=True,hts_N=1,hts_mode=False,steps=${STEPS},block_length=${BLOCK},gen_length=${LENGTH},task_type=code,temperature=0.7,realtime_output=${BASE_OUTPUT_PATH}/${NAME}/baseline.jsonl" \
+    --num_fewshot 0 \
+    --confirm_run_unsafe_code \
+    --output_path "${BASE_OUTPUT_PATH}/${NAME}"

Prism/LLaDA/LLaDA_Prism/.gitignore ADDED Viewed

	@@ -0,0 +1,210 @@

+*.jsonl
+*.json
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/

Prism/LLaDA/LLaDA_Prism/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 preordinary
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Prism/LLaDA/LLaDA_Prism/evaluation_script.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+import torch
+import random
+import numpy as np
+from dllm_eval.__main__ import cli_evaluate
+def set_seed(seed):
+    """Seed the torch, random and NumPy generators and switch cuDNN to deterministic mode."""
+    for seeder in (torch.manual_seed, random.seed, np.random.seed):
+        seeder(seed)
+    cudnn = torch.backends.cudnn
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+if __name__ == "__main__":
+    # Set the Hugging Face opt-in flags, fix the RNG seed, then hand over to the harness CLI.
+    os.environ.update({"HF_ALLOW_CODE_EVAL": "1", "HF_DATASETS_TRUST_REMOTE_CODE": "1"})
+    set_seed(42)
+    cli_evaluate()

Prism/LLaDA/LLaDA_Prism/requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+sacrebleu
+evaluate
+datasets
+numpy
+pandas
+tqdm
+regex
+sqlitedict
+pytablewriter

Prism/LLaDA/LLaDA_Truthfulqa/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+outputs
+logs
+LLaDA-8B-Instruct

Prism/LLaDA/LLaDA_Truthfulqa/LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Prism/LLaDA/LLaDA_Truthfulqa/eval_llada.py ADDED Viewed

	@@ -0,0 +1,413 @@

+'''
+This file is inspired by the code from https://github.com/ML-GSAI/SMDM
+'''
+import accelerate
+import torch
+import re
+from pathlib import Path
+import random
+import numpy as np
+import torch.nn.functional as F
+from datasets import Dataset
+from lm_eval.__main__ import cli_evaluate
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel
+import json
+import os
+import time
+def set_seed(seed):
+    """Apply one seed to torch, random and NumPy, with cuDNN set to deterministic and benchmarking off."""
+    backend = torch.backends.cudnn
+    backend.deterministic, backend.benchmark = True, False
+    for apply_seed in (torch.manual_seed, random.seed, np.random.seed):
+        apply_seed(seed)
+def _sample_categorical(categorical_probs):
+    """Draw one index along the last dimension by dividing the inputs by random noise and taking the argmax.
+
+    The noise is ``1e-10 - log(u + 1e-10)`` with ``u`` sampled by ``torch.rand_like``,
+    cast to the dtype of ``categorical_probs``.
+    """
+    uniform = torch.rand_like(categorical_probs)
+    noise = 1e-10 - (uniform + 1e-10).log()
+    noise = noise.to(categorical_probs.dtype)
+    scaled = categorical_probs / noise
+    return scaled.argmax(dim=-1)
+@register_model("llada_dist")
+class LLaDAEvalHarness(LM):
+    def __init__(
+        self,
+        model_path='',
+        mask_id=126336,
+        max_length=4096,
+        generated_samples_path='',
+        batch_size=32,
+        mc_num=128,
+        is_check_greedy=True,
+        cfg=0.,
+        sampling_steps=512,
+        mask_length=512,
+        block_size=32,
+        remasking='low_confidence',
+        device="cuda",
+        sampler='',
+        remdm_number=0
+    ):
+        '''
+        Load the model and tokenizer from ``model_path`` and store the evaluation settings.
+
+        Args:
+            model_path: LLaDA-8B-Base model path.
+            mask_id: The token id of [MASK] is 126336.
+            max_length: the max sequence length.
+            generated_samples_path: stored on the instance and printed at the end of construction.
+            batch_size: mini batch size; cast to int and required to divide mc_num evenly.
+            mc_num: Monte Carlo estimation iterations
+            is_check_greedy: For certain metrics like LAMBADA, the evaluation requires the model to verify whether the answer
+                             is generated through greedy sampling conditioned on the prompt (note that this differs from conditional
+                             generation). We implement this verification through the suffix_greedy_prediction() function, which
+                             returns a True/False judgment used for accuracy calculation.
+                             When is_check_greedy is set to True, the lm-evaluation-harness library automatically invokes this function.
+                             However, since none of the metrics in the LLaDA paper (https://arxiv.org/abs/2502.09992) require this functionality,
+                             we recommend setting is_check_greedy to False. This configuration causes suffix_greedy_prediction() to return False
+                             by default, significantly accelerating the evaluation process.
+            cfg: Unsupervised classifier-free guidance scale.
+            sampling_steps, mask_length, block_size, remasking, sampler, remdm_number:
+                stored unchanged on the instance; they are not used inside this method.
+            device: torch device for single-process runs; with several accelerate processes
+                the accelerator's device is used instead.
+        '''
+        super().__init__()
+        # Keep the accelerator only when more than one process is running.
+        accelerator = accelerate.Accelerator()
+        if accelerator.num_processes > 1:
+            self.accelerator = accelerator
+        else:
+            self.accelerator = None
+        model_kwargs = {}
+        if self.accelerator is not None:
+            # Multi-process: load the whole model onto this process's device.
+            model_kwargs.update({'device_map': {'': f'{self.accelerator.device}'}})
+        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, **model_kwargs)
+        self.model.eval()
+        self.device = torch.device(device)
+        if self.accelerator is not None:
+            self.model = self.accelerator.prepare(self.model)
+            self.device = torch.device(f'{self.accelerator.device}')
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        else:
+            # Single process: _rank/_world_size are not assigned here.
+            # NOTE(review): presumably the LM base class supplies defaults — confirm.
+            self.model = self.model.to(device)
+        self.mask_id = mask_id
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        self.mc_num = mc_num
+        self.batch_size = int(batch_size)
+        # The Monte Carlo iterations must split into whole mini-batches.
+        assert mc_num % self.batch_size == 0
+        self.sampling_eps = 0.
+        self.max_length = max_length
+        self.is_check_greedy = is_check_greedy
+        self.generated_samples_path = generated_samples_path
+        self.sampler = sampler
+        self.remdm_number = remdm_number
+        self.cfg = cfg
+        self.sampling_steps = sampling_steps
+        self.mask_length = mask_length
+        self.block_size = block_size
+        self.remasking = remasking
+        print(self.generated_samples_path)
+    @property
+    def rank(self):
+        """Return ``self._rank`` (set in ``__init__`` to the accelerator's local process index for multi-process runs)."""
+        # NOTE(review): __init__ does not assign _rank for single-process runs; presumably the LM base class provides a default -- confirm.
+        return self._rank
+    @property
+    def world_size(self):
+        """Return ``self._world_size`` (set in ``__init__`` to the accelerator's process count for multi-process runs)."""
+        # NOTE(review): __init__ does not assign _world_size for single-process runs; presumably the LM base class provides a default -- confirm.
+        return self._world_size
+    def _forward_process(self, batch, prompt_index):
+        """Randomly mask target tokens of ``batch`` and return the noised batch with the per-row mask ratio.
+        Args:
+            batch: 2-D token tensor (rows, sequence length).
+            prompt_index: boolean vector over sequence positions; its sum is used as the prompt length.
+                get_loglikelihood() builds it as a leading-prefix mask.
+        Returns:
+            (noisy_batch, p_mask): ``batch`` with the selected target positions replaced by ``self.mask_id``, and
+            the fraction of masked target tokens of each row repeated along the sequence dimension.
+        """
+        b, l = batch.shape
+        target_len = (l - prompt_index.sum()).item()
+        # Random starting mask count in [1, target_len].
+        k = torch.randint(1, target_len + 1, (), device=batch.device)
+        # Per-row mask counts: evenly spaced over the batch starting at k ...
+        x = torch.round(torch.linspace(float(k), k + (b - 1) * (target_len / b), steps=b, device=batch.device)).long()
+        # ... and wrapped back into [1, target_len].
+        x = ((x - 1) % target_len) + 1
+        assert x.min() >= 1 and x.max() <= target_len
+        indices = torch.arange(target_len, device=batch.device).repeat(b, 1)
+        # Row i starts with its first x[i] target positions marked as masked ...
+        is_mask = indices < x.unsqueeze(1)
+        # ... then each row's mask is shuffled independently.
+        for i in range(b):
+            is_mask[i] = is_mask[i][torch.randperm(target_len)]
+        # Prepend an all-False block so that the prompt positions are never masked.
+        is_mask = torch.cat((torch.zeros(b, prompt_index.sum(), dtype=torch.bool, device=batch.device), is_mask), dim=1)
+        noisy_batch = torch.where(is_mask, self.mask_id, batch)
+        return noisy_batch, (x / target_len).unsqueeze(1).repeat(1, l)
+    @torch.no_grad()
+    def get_logits(self, batch, prompt_index):
+        if self.cfg > 0.:
+            assert len(prompt_index) == batch.shape[1]
+            prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
+            un_batch = batch.clone()
+            un_batch[prompt_index] = self.mask_id
+            batch = torch.cat([batch, un_batch])
+        logits = self.model(batch).logits
+        if self.cfg > 0.:
+            logits, un_logits = torch.chunk(logits, 2, dim=0)
+            logits = un_logits + (self.cfg + 1) * (logits - un_logits)
+        return logits[:, :batch.shape[1]]
+    @torch.no_grad()
+    def get_loglikelihood(self, prefix, target):
+        seq = torch.concatenate([prefix, target])[None, :]
+        seq = seq.repeat((self.batch_size, 1)).to(self.device)
+        prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)
+        loss_acc = []
+        for _ in range(self.mc_num // self.batch_size):
+            perturbed_seq, p_mask = self._forward_process(seq, prompt_index)
+            mask_indices = perturbed_seq == self.mask_id
+            logits = self.get_logits(perturbed_seq, prompt_index)
+            loss = F.cross_entropy(logits[mask_indices], seq[mask_indices], reduction='none') / p_mask[mask_indices]
+            loss = loss.sum() / self.batch_size
+            loss_acc.append(loss.item())
+        return - sum(loss_acc) / len(loss_acc)
+    @torch.no_grad()
+    def suffix_greedy_prediction(self, prefix, target):
+        """Check whether greedy, most-confident-first decoding after ``prefix`` reproduces ``target`` exactly.
+        Returns:
+            ``False`` immediately when ``self.is_check_greedy`` is falsy; otherwise a 0-dim boolean tensor that is
+            True only if every decoded token equals the corresponding token of ``target``.
+        """
+        if not self.is_check_greedy:
+            return False
+        # Start from the prefix followed by len(target) mask tokens.
+        seq = torch.full((1, len(prefix) + len(target)), self.mask_id, device=self.device)
+        prompt_index = torch.arange(seq.shape[1], device=self.device) < len(prefix)
+        prefix, target = prefix.to(self.device), target.to(self.device)
+        seq[0, :len(prefix)] = prefix
+        # One token is committed per iteration, so len(target) iterations fill every masked position.
+        for i in range(len(target)):
+            mask_index = (seq == self.mask_id)
+            logits = self.get_logits(seq, prompt_index)[mask_index]
+            x0 = torch.argmax(logits, dim=-1)
+            p = torch.softmax(logits.to(torch.float32), dim=-1)
+            confidence = torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)).squeeze(dim=-1)
+            _, index = torch.sort(confidence, descending=True)
+            # Keep only the most confident prediction; all other positions go back to the mask token.
+            x0[index[1:]] = self.mask_id
+            seq[mask_index] = x0.clone()
+        correct = target == seq[0, len(prefix):]
+        correct = torch.all(correct)
+        return correct
+    def _encode_pair(self, context, continuation):
+        n_spaces = len(context) - len(context.rstrip())
+        if n_spaces > 0:
+            continuation = context[-n_spaces:] + continuation
+            context = context[:-n_spaces]
+        whole_enc = self.tokenizer(context + continuation)["input_ids"]
+        context_enc = self.tokenizer(context)["input_ids"]
+        context_enc_len = len(context_enc)
+        continuation_enc = whole_enc[context_enc_len:]
+        return context_enc, continuation_enc
+    def loglikelihood(self, requests):
+        def _tokenize(e):
+            prefix, target = self._encode_pair(e["prefix"], e["target"])
+            return {
+                "prefix_text": e["prefix"],
+                "target_text": e["target"],
+                "prefix": prefix,
+                "target": target,
+            }
+        ds = []
+        ds = [{"prefix": req.args[0], "target": req.args[1]} for req in requests]
+        ds = Dataset.from_list(ds)
+        ds = ds.map(_tokenize)
+        ds = ds.with_format("torch")
+        prompt_len = [len(x["prefix"]) + len(x["target"]) for x in ds]
+        assert max(prompt_len) <= 4096
+        out = []
+        with torch.no_grad():
+            for elem in tqdm(ds, desc="Computing likelihood..."):
+                prefix = elem["prefix"]
+                target = elem["target"]
+                ll = self.get_loglikelihood(prefix, target)
+                is_target_greedy_dec = self.suffix_greedy_prediction(prefix, target)
+                out.append((ll, 1.0 if is_target_greedy_dec else 0.0))
+        torch.cuda.empty_cache()
+        return out
+    def loglikelihood_rolling(self, requests):
+        # Rolling log-likelihood is not implemented by this harness; any call raises.
+        raise NotImplementedError
+    @torch.no_grad()
+    def llada_conf_sample(self, prompt):
+        """Block-wise confidence-based sampling: append ``mask_length`` masks to ``prompt`` and fill them step by step.
+        Args:
+            prompt: prompt token ids; ``.item()`` on the prompt length below requires a single row.
+        Returns:
+            The full sequence (prompt + generated positions). The function returns early after any block once an
+            EOS token id is present anywhere in the sequence, so later blocks may still hold mask tokens.
+        """
+        xt = torch.full((1, prompt.shape[1] + self.mask_length), self.mask_id, dtype=torch.long).to(self.model.device)
+        xt[:, :prompt.shape[1]] = prompt.clone()
+        prompt_index = (xt != self.mask_id)
+        prompt_len = prompt_index.sum(1).item()
+        assert self.mask_length % self.block_size == 0
+        num_blocks = self.mask_length // self.block_size
+        assert self.sampling_steps % num_blocks == 0
+        # Denoising steps spent on each block.
+        steps = self.sampling_steps // num_blocks
+        assert self.mask_length % self.sampling_steps == 0
+        for num_block in range(num_blocks):
+            for i in range(steps):
+                mask_index = (xt == self.mask_id)
+                logits = self.model(xt).logits
+                p = F.softmax(logits.to(torch.float64), dim=-1)
+                x0 = _sample_categorical(p)
+                x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
+                # Positions after the current block get -inf confidence so they rank last.
+                x0_p[:, prompt_len + (num_block + 1) * self.block_size:] = -np.inf
+                x0 = torch.where(mask_index, x0, xt)
+                # Only still-masked positions compete for selection.
+                confidence = torch.where(mask_index, x0_p, -np.inf)
+                transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
+                for j in range(confidence.shape[0]):
+                    # Commit the mask_length / sampling_steps most confident predictions of this step.
+                    _, select_index = torch.topk(confidence[j], k=int(self.mask_length / self.sampling_steps))
+                    transfer_index[j, select_index] = True
+                xt[transfer_index] = x0[transfer_index]
+            # Early exit: stop after this block if an EOS token id appears anywhere in the sequence.
+            if torch.sum(xt == self.tokenizer.eos_token_id) > 0:
+                return xt
+        return xt
+    @torch.no_grad()
+    def llada_remdm_sample(self, prompt):
+        """Block-wise confidence-based sampling with a remasking phase inside every block.
+        Each block runs ``2 * steps`` iterations. For iterations in ``[remask_thres, remask_thres + steps)`` the
+        ``remdm_number`` committed positions with the lowest cached confidence are re-masked before the model call,
+        and ``remdm_number`` predictions are committed afterwards; all other iterations commit
+        ``mask_length / sampling_steps`` predictions.
+        Args:
+            prompt: prompt token ids; ``.item()`` on the prompt length below requires a single row.
+        Returns:
+            The full sequence (prompt + generated positions); returns early after any block once an EOS token id
+            is present anywhere in the sequence.
+        """
+        xt = torch.full((1, prompt.shape[1] + self.mask_length), self.mask_id, dtype=torch.long).to(self.model.device)
+        xt[:, :prompt.shape[1]] = prompt.clone()
+        prompt_index = (xt != self.mask_id)
+        prompt_len = prompt_index.sum(1).item()
+        assert self.mask_length % self.block_size == 0
+        num_blocks = self.mask_length // self.block_size
+        assert self.sampling_steps % num_blocks == 0
+        steps = self.sampling_steps // num_blocks
+        assert self.mask_length % self.sampling_steps == 0
+        for num_block in range(num_blocks):
+            # Confidence at which each position was committed; +inf elsewhere so that such positions
+            # are the last candidates of the smallest-k selection below. Reset for every block.
+            conf_cache = torch.ones_like(xt, dtype=torch.float64) * np.inf
+            # First iteration index of the remasking window.
+            remask_thres = int(self.block_size / 8 * 7)
+            for i in range(2 * steps):
+                if i >= remask_thres and i < remask_thres + steps:
+                    # Re-mask the remdm_number positions with the lowest cached confidence.
+                    remask_index = torch.zeros_like(xt, dtype=torch.bool, device=xt.device)
+                    _, mask_indices = torch.topk(conf_cache, k=self.remdm_number, largest=False, dim=1)
+                    remask_index[0, mask_indices] = True
+                    conf_cache[remask_index] = np.inf
+                    xt[remask_index] = self.mask_id
+                mask_index = (xt == self.mask_id)
+                logits = self.model(xt).logits
+                p = F.softmax(logits.to(torch.float64), dim=-1)
+                x0 = _sample_categorical(p)
+                x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1) # b, l
+                # Positions after the current block get -inf confidence so they rank last.
+                x0_p[:, prompt_len + (num_block + 1) * self.block_size:] = -np.inf
+                x0 = torch.where(mask_index, x0, xt)
+                confidence = torch.where(mask_index, x0_p, -np.inf)
+                # Inside the remasking window, refill as many tokens as were just re-masked.
+                if i >= remask_thres and i < remask_thres + steps:
+                    transfer_length = self.remdm_number
+                else:
+                    transfer_length = int(self.mask_length / self.sampling_steps)
+                transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
+                for j in range(confidence.shape[0]):
+                    _, select_index = torch.topk(confidence[j], k=transfer_length)
+                    transfer_index[j, select_index] = True
+                xt[transfer_index] = x0[transfer_index]
+                # Remember the confidence of what was just committed for later remasking decisions.
+                conf_cache[transfer_index] = confidence[transfer_index]
+            # Early exit: stop after this block if an EOS token id appears anywhere in the sequence.
+            if torch.sum(xt == self.tokenizer.eos_token_id) > 0:
+                return xt
+        return xt
+    @torch.no_grad()
+    def generate_until(self, requests: list[Instance]):
+        start_time = time.time()
+        def _tokenize(e):
+            return {
+                "question": self.tokenizer(e["question"])["input_ids"],
+                "question_text": e["question"],
+                "until": e["until"],
+            }
+        ds = [{"question": req.args[0], "until": req.args[1]['until']} for req in requests]
+        ds = Dataset.from_list(ds)
+        ds = ds.map(_tokenize)
+        ds = ds.with_format("torch")
+        out, out_for_json = [], []
+        for elem in tqdm(ds, desc="Generating..."):
+            prompt = elem["question"].unsqueeze(0).to(self.device)
+            stop_tokens = elem["until"] + ["<|eot_id|>", self.tokenizer.eos_token]
+            if self.sampler == 'llada_conf':
+                generated_answer = self.llada_conf_sample(prompt)
+            elif self.sampler == 'llada_remdm':
+                generated_answer = self.llada_remdm_sample(prompt)
+            generated_answer = self.tokenizer.decode(generated_answer[0][prompt.shape[1]:], skip_special_tokens=False)
+            # print(elem['question_text'] + generated_answer)
+            for stop_seq in stop_tokens:
+                if stop_seq in generated_answer:
+                    generated_answer = generated_answer.split(stop_seq)[0]
+            # remove special tokens
+            generated_answer_ids = self.tokenizer(generated_answer)["input_ids"]
+            generated_answer = self.tokenizer.decode(generated_answer_ids, skip_special_tokens=True)
+            # print(elem['question_text'] + generated_answer)
+            out.append(generated_answer)
+            out_for_json.append({
+                "prefix": elem["question_text"],
+                "result": generated_answer,
+            })
+            if self.accelerator is not None:
+                self.accelerator.wait_for_everyone()
+        end_time = time.time()
+        total_duration = end_time - start_time
+        print(f"\n总耗时: {total_duration:.2f} 秒")
+        with open(os.path.join(self.generated_samples_path, str(self._rank) + ".json"), "w") as f:
+            final_output = {
+                "total_time_seconds": total_duration,
+                "samples": out_for_json
+            }
+            json.dump(final_output, f, indent=2)
+        return out
+# Script entry point: hand control to the lm-evaluation-harness command-line interface.
+if __name__ == "__main__":
+    cli_evaluate()

Prism/LLaDA/LLaDA_Truthfulqa/eval_llada_prism.py ADDED Viewed

	@@ -0,0 +1,333 @@

+'''
+This file is inspired by the code from https://github.com/ML-GSAI/SMDM
+And extended with Prism methods for LLaDA-Instruct.
+'''
+import accelerate
+import torch
+import re
+from pathlib import Path
+import random
+import numpy as np
+import torch.nn.functional as F
+from datasets import Dataset
+from lm_eval.__main__ import cli_evaluate
+from lm_eval.api.instance import Instance
+from lm_eval.api.model import LM
+from lm_eval.api.registry import register_model
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModel
+import json
+import os
+import logging
+import math
+import textwrap
+import time
+from collections import Counter
+logger = logging.getLogger(__name__)
+def set_seed(seed):
+    torch.manual_seed(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+def _sample_categorical(categorical_probs):
+    gumbel_norm = (1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log()).to(categorical_probs.dtype)
+    return (categorical_probs / gumbel_norm).argmax(dim=-1)
+class CodeVerifier:
+    def __init__(self, model, tokenizer, device="cuda"):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+        self.yes_ids, self.no_ids = [], []
+        for t in ["Yes", " Yes", "YES"]:
+            ids = self.tokenizer.encode(t, add_special_tokens=False)
+            if ids: self.yes_ids.append(ids[-1])
+        for t in ["No", " No", "NO"]:
+            ids = self.tokenizer.encode(t, add_special_tokens=False)
+            if ids: self.no_ids.append(ids[-1])
+        self.yes_ids = list(set(self.yes_ids))
+        self.no_ids = list(set(self.no_ids))
+    def svf_score(self, prompt, code_str, task_type="code"):
+        """Ask the model whether ``code_str`` is a correct answer to ``prompt`` and return the probability of "Yes".
+        Args:
+            prompt: problem / question text inserted into the judging template.
+            code_str: candidate answer; only its first 2000 characters are used.
+            task_type: selects the template ("code", "math", "reasoning", "qa"); any other value uses a short
+                generic template.
+        Returns:
+            Softmax probability of "Yes" over the pair (best Yes-token logit, best No-token logit), taken from the
+            logits at the last input position.
+        """
+        # Character (not token) budget for the candidate answer.
+        max_len = 2000
+        truncated_code = code_str[:max_len]
+        # NOTE(review): textwrap.dedent() below only removes whitespace common to *all* lines, so an interpolated
+        # multi-line prompt/answer with unindented lines leaves the template indentation in place -- confirm intended.
+        if task_type == "code":
+            prompt_template = f"""
+        You are an expert programming contest judge. Your task is to evaluate a generated solution for a given problem based on correctness, efficiency, and adherence to constraints.
+        [Problem Statement]
+        {prompt}
+        [/Problem Statement]
+        [Proposed Python Solution]
+        ```python
+        {truncated_code}
+        ```
+        [/Proposed Python Solution]
+        **Conclusion**: Based on your analysis, is the solution likely to be fully correct? Answer with a single word: Yes or No.
+        **Answer:** """
+        elif task_type == "math":
+            prompt_template = f"""
+        You are an expert mathematician and competition judge. Your task is to evaluate a proposed mathematical solution for a given problem based on its logical rigor and accuracy.
+        [Math Problem]
+        {prompt}
+        [/Math Problem]
+        [Proposed Mathematical Solution]
+        {truncated_code}
+        [/Proposed Mathematical Solution]
+        **Conclusion**: Based on your analysis, is this solution path sound and likely to result in the correct final answer? Answer with a single word: Yes or No.
+        **Answer:** """
+        elif task_type == "reasoning":
+            prompt_template = f"""
+        You are an expert reading comprehension and faithfulness judge. Your task is to evaluate a generated answer based on the provided context and question.
+        [Context and Question]
+        {prompt}
+        [/Context and Question]
+        [Proposed Answer]
+        {truncated_code}
+        [/Proposed Answer]
+        **Conclusion**: Based on your analysis, is the answer fully faithful to the context and correct? Answer with a single word: Yes or No.
+        **Answer:** """
+        elif task_type == "qa":
+            prompt_template = f"""
+        You are a strict truthfulness judge. Your task is to verify if the answer is factually accurate and rejects common human misconceptions or popular urban legends.
+        [Question]
+        {prompt}
+        [/Question]
+        [Proposed Answer]
+        {truncated_code}
+        [/Proposed Answer]
+        **Conclusion**: Based on these criteria, is this answer fully truthful and accurate? Answer with a single word: Yes or No.
+        **Answer:** """
+        else:
+            prompt_template = f"Is the following answer correct?\nQuestion: {prompt}\nAnswer: {truncated_code}\nAnswer Yes or No.\nAnswer:"
+        verify_text = textwrap.dedent(prompt_template).strip()
+        input_ids = self.tokenizer(verify_text, return_tensors="pt").input_ids.to(self.device)
+        # Keep only the last 2048 tokens so that the question at the end of the template survives truncation.
+        if input_ids.shape[1] > 2048: input_ids = input_ids[:, -2048:]
+        with torch.no_grad():
+            outputs = self.model(input_ids)
+            logits = outputs.logits[0, -1, :]
+            # Best logit among the Yes / No token ids; -100.0 is the fallback when no id is usable.
+            yes_score = max([logits[i].item() for i in self.yes_ids if i < logits.shape[-1]] + [-100.0])
+            no_score = max([logits[i].item() for i in self.no_ids if i < logits.shape[-1]] + [-100.0])
+            probs = torch.softmax(torch.tensor([yes_score, no_score]), dim=0)
+            return probs[0].item()
+    def get_reward(self, prompt, code_str, mode="confidence", current_logits=None, task_type="code"):
+        if mode == "svf":
+            return self.svf_score(prompt, code_str, task_type=task_type)
+        else:
+            if current_logits is None: return 0.0
+            probs = torch.softmax(current_logits.to(torch.float32), dim=-1)
+            max_probs, _ = torch.max(probs, dim=-1)
+            return torch.exp(torch.mean(torch.log(max_probs + 1e-10))).item()
+class HTSSampler:
+    def __init__(self, model, tokenizer, device="cuda"):
+        """Store the model, tokenizer and device, and build a CodeVerifier on the same model for scoring candidates."""
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+        self.verifier = CodeVerifier(model, tokenizer, device)
+    def _sample_with_temperature(self, logits, temperature=0.7):
+        logits = logits.to(torch.float32)
+        if temperature > 0:
+            probs = torch.softmax(logits / temperature, dim=-1)
+            x0 = torch.multinomial(probs.view(-1, probs.shape[-1]), 1).view(logits.shape[:-1])
+            x0_p = torch.gather(torch.softmax(logits, dim=-1), -1, x0.unsqueeze(-1)).squeeze(-1)
+        else:
+            x0_p, x0 = torch.max(torch.softmax(logits, dim=-1), dim=-1)
+        return x0, x0_p
+    @torch.no_grad()
+    def generate_hts(self, prompt_text, input_ids, initial_N=1, final_K=1, hts_survivor_k=2,
+                     steps=32, gen_length=32, mask_id=126336, reward_mode="svf", task_type="qa",
+                     decay_factor=1.8, hts_start_pct=0.1, hts_end_pct=0.6, pruning_interval=3):
+        """Denoise ``initial_N`` candidates in parallel, pruning low-scoring ones during a window of the schedule.
+        Args:
+            prompt_text: prompt as text, passed to the verifier when scoring candidates.
+            input_ids: prompt token ids, repeated across the candidate rows (presumably shape (1, prompt_len) -- confirm).
+            initial_N: number of candidates at the start.
+            final_K: lower bound on the number of candidates that pruning keeps.
+            hts_survivor_k: not used in this function body.
+            steps: number of denoising steps; ``gen_length`` tokens are distributed over them.
+            gen_length: number of mask tokens appended to the prompt.
+            mask_id: token id of the mask token.
+            reward_mode, task_type: forwarded to ``CodeVerifier.get_reward``.
+            decay_factor: base of the exponential decay of the target candidate count.
+            hts_start_pct, hts_end_pct: fractions of ``steps`` delimiting the pruning window.
+            pruning_interval: minimum number of steps between two prunings.
+        Returns:
+            The decoded texts of all surviving candidates, sorted by their final score in descending order.
+        """
+        b = initial_N
+        prompt_len = input_ids.shape[1]
+        xt = torch.full((b, prompt_len + gen_length), mask_id, dtype=torch.long, device=self.device)
+        xt[:, :prompt_len] = input_ids.repeat(b, 1)
+        conf_scores = torch.zeros((b, prompt_len + gen_length), device=self.device)
+        ts_start, tr_end = int(steps * hts_start_pct), int(steps * hts_end_pct)
+        # Tokens to commit per step: gen_length // steps each, with the remainder spread over the first steps.
+        schedule = torch.full((steps,), gen_length // steps, dtype=torch.int64, device=self.device)
+        schedule[:gen_length % steps] += 1
+        next_pruning = ts_start
+        for i in range(steps):
+            mask_indices = (xt == mask_id)
+            if not mask_indices.any(): break
+            logits = self.model(xt).logits
+            x0, x0_p = self._sample_with_temperature(logits[:, prompt_len:], temperature=0.7)
+            # Update tokens based on confidence
+            for idx in range(b):
+                curr_mask = mask_indices[idx, prompt_len:]
+                if not curr_mask.any(): continue
+                conf = torch.where(curr_mask, x0_p[idx], -float('inf'))
+                _, sel_idx = torch.topk(conf, k=min(schedule[i].item(), curr_mask.sum().item()))
+                xt[idx, prompt_len + sel_idx] = x0[idx, sel_idx]
+                conf_scores[idx, prompt_len + sel_idx] = x0_p[idx, sel_idx]
+            # Pruning
+            if i >= next_pruning and i < tr_end and b > final_K:
+                # Target width decays exponentially with the steps elapsed since ts_start, never below final_K.
+                target_width = max(final_K, math.ceil(initial_N * (decay_factor ** -(i - ts_start))))
+                if b > target_width:
+                    scores = []
+                    decoded_texts = self.tokenizer.batch_decode(xt[:, prompt_len:], skip_special_tokens=True)
+                    for j in range(b):
+                        s = self.verifier.get_reward(prompt_text, decoded_texts[j], mode=reward_mode,
+                                                   task_type=task_type, current_logits=logits[j, prompt_len:])
+                        scores.append(s)
+                    # Keep the best-scoring candidates and shrink the batch accordingly.
+                    top_k_indices = torch.topk(torch.tensor(scores), k=min(target_width, b)).indices
+                    xt = xt[top_k_indices]
+                    conf_scores = conf_scores[top_k_indices]
+                    b = xt.shape[0]
+                    next_pruning = i + pruning_interval
+        # Final decoding and ranking
+        final_texts = self.tokenizer.batch_decode(xt[:, prompt_len:], skip_special_tokens=True)
+        results = []
+        for j in range(b):
+            # NOTE(review): no logits are passed here, so every reward mode other than "svf" scores each candidate 0.0.
+            s = self.verifier.get_reward(prompt_text, final_texts[j], mode=reward_mode, task_type=task_type)
+            results.append({'text': final_texts[j], 'score': s})
+        results.sort(key=lambda v: v['score'], reverse=True)
+        return [r['text'] for r in results]
+@register_model("llada_dist")
+class LLaDAEvalHarness(LM):
+    def __init__(self, model_path='', mask_id=126336, max_length=4096, generated_samples_path='',
+                 batch_size=32, sampling_steps=64, mask_length=128, sampler='hts', task_type="qa",
+                 hts_initial_n=8, final_K=1, hts_reward_mode="svf", hts_start_pct=0.1, hts_end_pct=0.6,
+                 **kwargs):
+        """Load the model and tokenizer from ``model_path`` and store the sampling configuration.
+        Args:
+            model_path: path or identifier passed to ``from_pretrained``.
+            mask_id: token id of the mask token.
+            max_length, batch_size, **kwargs: accepted but not used in this constructor.
+            generated_samples_path: directory in which ``generate_until`` writes ``res.json``; empty disables the dump.
+            sampling_steps: number of denoising steps.
+            mask_length: number of mask tokens appended to the prompt.
+            sampler: 'hts' selects the HTS sampler; any other value uses ``llada_conf_sample``.
+            task_type: verifier template selector forwarded to the HTS sampler.
+            hts_initial_n, final_K, hts_reward_mode, hts_start_pct, hts_end_pct: forwarded to
+                ``HTSSampler.generate_hts`` by ``generate_until``.
+        """
+        super().__init__()
+        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
+        self.model.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        self.device = self.model.device
+        self.mask_id = mask_id
+        # Numeric settings are cast explicitly (presumably because they may arrive as strings -- confirm).
+        self.sampling_steps = int(sampling_steps)
+        self.mask_length = int(mask_length)
+        self.sampler = sampler
+        self.task_type = task_type
+        self.generated_samples_path = generated_samples_path
+        self.hts_initial_n = int(hts_initial_n)
+        self.final_K = int(final_K)
+        self.hts_reward_mode = hts_reward_mode
+        self.hts_start_pct = float(hts_start_pct)
+        self.hts_end_pct = float(hts_end_pct)
+        self.hts_sampler = HTSSampler(self.model, self.tokenizer, self.device)
+        self._rank = 0
+    @torch.no_grad()
+    def llada_conf_sample(self, prompt):
+        xt = torch.full((1, prompt.shape[1] + self.mask_length), self.mask_id, dtype=torch.long, device=self.device)
+        xt[:, :prompt.shape[1]] = prompt
+        step_size = self.mask_length // self.sampling_steps
+        for i in range(self.sampling_steps):
+            mask_indices = (xt == self.mask_id)
+            if not mask_indices.any(): break
+            logits = self.model(xt).logits
+            p = F.softmax(logits.to(torch.float64), dim=-1)
+            x0 = _sample_categorical(p)
+            x0_p = torch.squeeze(torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)
+            confidence = torch.where(mask_indices, x0_p, -float('inf'))
+            _, select_idx = torch.topk(confidence[0], k=min(step_size, mask_indices.sum().item()))
+            xt[0, select_idx] = x0[0, select_idx]
+        return xt
+    @torch.no_grad()
+    def generate_until(self, requests):
+        start_time = time.time()
+        out, out_for_json = [], []
+        for req in tqdm(requests, desc="Generating..."):
+            prompt_text = req.args[0]
+            until = req.args[1]['until']
+            prompt_ids = self.tokenizer(prompt_text, return_tensors="pt").input_ids.to(self.device)
+            if self.sampler == 'hts':
+                candidates = self.hts_sampler.generate_hts(
+                    prompt_text=prompt_text,
+                    input_ids=prompt_ids,
+                    initial_N=self.hts_initial_n,
+                    final_K=self.final_K,
+                    steps=self.sampling_steps,
+                    gen_length=self.mask_length,
+                    reward_mode=self.hts_reward_mode,
+                    task_type=self.task_type,
+                    hts_start_pct=self.hts_start_pct,
+                    hts_end_pct=self.hts_end_pct
+                )
+                if not candidates:
+                    generated_answer = ""
+                else:
+                    counts = Counter(candidates)
+                    most_common = counts.most_common()
+                    if most_common[0][1] > 1:
+                        generated_answer = most_common[0][0]
+                    else:
+                        generated_answer = candidates[0]
+            else:
+                res_ids = self.llada_conf_sample(prompt_ids)
+                generated_answer = self.tokenizer.decode(res_ids[0, prompt_ids.shape[1]:], skip_special_tokens=True)
+            for stop_seq in until + ["<|eot_id|>", self.tokenizer.eos_token]:
+                if stop_seq and stop_seq in generated_answer:
+                    generated_answer = generated_answer.split(stop_seq)[0]
+            generated_answer = generated_answer.strip()
+            out.append(generated_answer)
+            out_for_json.append({"prefix": prompt_text, "result": generated_answer})
+        end_time = time.time()
+        total_duration = end_time - start_time
+        if self.generated_samples_path:
+            os.makedirs(self.generated_samples_path, exist_ok=True)
+            final_output = {
+                "total_time_seconds": total_duration,
+                "samples": out_for_json
+            }
+            with open(os.path.join(self.generated_samples_path, "res.json"), "w") as f:
+                json.dump(final_output, f, indent=2)
+        return out
+    # Stub: log-likelihood scoring is not implemented here; an empty list is returned regardless of `requests`.
+    # NOTE(review): callers expecting one result per request will not get them -- confirm only generation tasks are run.
+    def loglikelihood(self, requests): return []
+    # Stub: rolling log-likelihood is not implemented here; an empty list is returned regardless of `requests`.
+    def loglikelihood_rolling(self, requests): return []
+    # This harness always reports rank 0.
+    @property
+    def rank(self): return 0
+    # This harness always reports a world size of 1.
+    @property
+    def world_size(self): return 1
+# Script entry point: hand control to the lm-evaluation-harness command-line interface.
+if __name__ == "__main__":
+    cli_evaluate()

Prism/README.md ADDED Viewed

	@@ -0,0 +1,107 @@

+# Prism: Efficient Test-Time Scaling via Hierarchical Search and Self-Verification for Discrete Diffusion Language Models
+## PRISM: Pruning, Remasking, and Integrated Self-verification Method
+PRISM is an efficient inference framework designed for **Discrete Diffusion Language Models (dLLMs)**. It targets a favorable performance-efficiency trade-off, matching Best-of-N performance with a substantially lower Number of Function Evaluations (NFE).
+[![arXiv](https://img.shields.io/badge/arXiv-2602.01842-b31b1b.svg)](https://arxiv.org/abs/2602.01842)
+[![GitHub](https://img.shields.io/badge/GitHub-Repo-181717?logo=github)](https://github.com/viiika/Prism)
+### Method
+![Prism Method](method.png)
+### Experiments
+![Prism Exp](exp.png)
+### Project Structure
+```text
+PRISM/
+├── Dream/                   # Experiments for Dream
+│   ├── Dream_Baseline/      # Standard baseline sampling (N=1)
+│   └── Dream_Prism/         # Prism implementation
+├── LLaDA/                   # Experiments for LLaDA 8B Instruct
+│   ├── LLaDA_Baseline/      # Standard baseline sampling (N=1)
+│   ├── LLaDA_Prism/         # PRISM implementation
+│   └── LLaDA_Truthfulqa/    # TruthfulQA evaluation
+└── LLaDA2mini/              # Experiments for LLaDA 2.0-mini
+    ├── LLaDA2mini_Baseline/ # Standard baseline sampling (N=1)
+    └── LLaDA2mini_Prism/    # Prism implementation
+```
+### Prerequisites
+```bash
+cd PRISM
+```
+For Dream Project:
+```bash
+cd Dream/Dream_Prism/eval_instruct
+pip install -e .
+```
+For LLaDA_Truthfulqa:
+```bash
+cd LLaDA/LLaDA_Truthfulqa/lm-evaluation-harness
+pip install -e .
+```
+For LLaDA and LLaDA2 Projects:
+```bash
+cd LLaDA/LLaDA_Prism
+pip install -r requirements.txt
+```
+### Quick Start
+Evaluate Dream
+```bash
+cd Dream/Dream_Prism
+bash scripts/run_gsm8k.sh
+bash scripts/run_humaneval.sh
+bash scripts/run_math500.sh
+bash scripts/run_mbpp.sh
+```
+Evaluate LLaDA 8B Instruct
+```bash
+cd LLaDA/LLaDA_Prism
+bash scripts/run_gsm8k.sh
+bash scripts/run_humaneval.sh
+bash scripts/run_math500.sh
+bash scripts/run_mbpp.sh
+```
+Evaluate LLaDA 8B Instruct (TruthfulQA)
+```bash
+cd LLaDA/LLaDA_Truthfulqa
+bash scripts/llada_prism.sh
+```
+Evaluate LLaDA 2.0-mini
+```bash
+cd LLaDA2mini/LLaDA2mini_Prism
+bash scripts/run_gsm8k.sh
+bash scripts/run_humaneval.sh
+bash scripts/run_math500.sh
+bash scripts/run_mbpp.sh
+```
+### Evaluation & Metrics
+Each project folder contains a `metrics/` directory with scripts for computing the final accuracy and efficiency metrics.
+Usage Example:
+```bash
+python PRISM/LLaDA/LLaDA_Prism/metrics/gsm8k_all.py
+```
+### Acknowledgements
+This project is built upon [preordinary/LLaDA2](https://github.com/preordinary/LLaDA2), [ML-GSAI/LLaDA](https://github.com/ML-GSAI/LLaDA), [DreamLM/Dream](https://github.com/DreamLM/Dream) and [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). Special thanks to the authors for their contributions.
+### 📚 Citation
+If you find this work helpful, please consider citing:
+```bibtex
+@article{bai2026prism,
+  title={Prism: Efficient Test-Time Scaling via Hierarchical Search and Self-Verification for Discrete Diffusion Language Models},
+  author={Bai, Jinbin and Li, Yixuan and Zhu, Yuchen and Xin, Yi and Shi, Qingyu and Feng, Aosong and Liu, Xiaohong and Tao, Molei and Xue, Jianru and Li, Xiangtai and Yang, Ming-Hsuan},
+  journal={arXiv preprint arXiv:2602.01842},
+  year={2026}
+}
+```

URSA-1.7B/.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+. filter=lfs diff=lfs merge=lfs -text
+tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

URSA-1.7B/.gitignore ADDED Viewed

	@@ -0,0 +1,55 @@

+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.cuo
+# Compiled Dynamic libraries
+*.so
+*.dll
+*.dylib
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+# Compiled python
+*.pyc
+__pycache__
+# Compiled MATLAB
+*.mex*
+# IPython notebook checkpoints
+.ipynb_checkpoints
+# Editor temporaries
+*.swp
+*~
+# Sublime Text settings
+*.sublime-workspace
+*.sublime-project
+# Eclipse Project settings
+*.*project
+.settings
+# QtCreator files
+*.user
+# VSCode files
+.vscode
+# IDEA files
+.idea
+# OSX dir files
+.DS_Store
+# Android files
+.gradle
+*.iml
+local.properties

URSA-1.7B/LICENSE ADDED Viewed

	@@ -0,0 +1,176 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS

URSA-1.7B/README.md ADDED Viewed

	@@ -0,0 +1,117 @@

+---
+library_name: diffusers
+license: apache-2.0
+license_link: https://huggingface.co/BAAI/URSA-1.7B-FSQ320/blob/main/LICENSE
+pipeline_tag: text-to-video
+base_model:
+- Qwen/Qwen3-1.7B
+---
+# URSA-1.7B-FSQ320 Model Card
+## Model Details
+- **Developed by:** BAAI
+- **Model type:** Text-to-Video Generation Model
+- **Model size:** 1.7B
+- **Model precision:** torch.float16 (FP16)
+- **Model resolution:** 512x320
+- **Model paper:** [Uniform Discrete Diffusion with Metric Path for Video Generation](https://arxiv.org/abs/2510.24717)
+- **Model family:** [BAAI-Vision-URSA](https://github.com/baaivision/URSA)
+- **Model Tokenizer:** [Cosmos-Tokenize1-DV4x8x8-360p](https://huggingface.co/nvidia/Cosmos-Tokenize1-DV4x8x8-360p)
+- **Model Description:** This is a model that can be used to generate and modify videos based on text prompts.
+## Examples
+Use the [🤗 Diffusers library](https://github.com/huggingface/diffusers) to run URSA in a simple and efficient manner.
+```bash
+pip install diffusers transformers accelerate imageio[ffmpeg]
+pip install git+ssh://git@github.com/baaivision/URSA.git
+```
+Running the pipeline:
+```python
+import os, torch, numpy
+from diffnext.pipelines import URSAPipeline
+from diffnext.utils import export_to_video
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+model_id, height, width = "BAAI/URSA-1.7B-FSQ320", 320, 512
+model_args = {"torch_dtype": torch.float16, "trust_remote_code": True}
+pipe = URSAPipeline.from_pretrained(model_id, **model_args)
+pipe = pipe.to(torch.device("cuda"))
+text_prompt = "a lone grizzly bear walks through a misty forest at dawn, sunlight catching its fur."
+negative_prompt = "worst quality, low quality, inconsistent motion, static, still, blurry, jittery, distorted, ugly"
+# Text-to-Image
+prompt = text_prompt
+num_frames, num_inference_steps = 1, 25
+image = pipe(**locals()).frames[0]
+image.save("ursa.jpg")
+# Image-to-Video
+prompt = f"motion=9.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+video = pipe(**locals()).frames[0]
+export_to_video(video, "ursa_1+48f.mp4", fps=12)
+# Text-to-Video
+image, video = None, None
+prompt = f"motion=9.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+video = pipe(**locals()).frames[0]
+export_to_video(video, "ursa_49f.mp4", fps=12)
+# Video-to-Video
+prompt = f"motion=5.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+num_cond_frames, cond_noise_scale = 13, 0.1
+for i in range(12):
+    video, start_video = video[-num_cond_frames:], video
+    video = pipe(**locals()).frames[0]
+    video = numpy.concatenate([start_video, video[num_cond_frames:]])
+    export_to_video(video, "ursa_{}f.mp4".format(video.shape[0]), fps=12)
+```
+# Uses
+## Direct Use
+The model is intended for research purposes only. Possible research areas and tasks include
+- Research on generative models.
+- Applications in educational or creative tools.
+- Generation of artworks and use in design and other artistic processes.
+- Probing and understanding the limitations and biases of generative models.
+- Safe deployment of models which have the potential to generate harmful content.
+Excluded uses are described below.
+#### Out-of-Scope Use
+The model was not trained to produce factual or true representations of people or events; using the model to generate such content is therefore out of scope for its abilities.
+#### Misuse and Malicious Use
+Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
+- Mis- and disinformation.
+- Representations of egregious violence and gore.
+- Impersonating individuals without their consent.
+- Sexual content without consent of the people who might see it.
+- Sharing of copyrighted or licensed material in violation of its terms of use.
+- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
+- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
+- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
+## Limitations and Bias
+### Limitations
+- The autoencoding part of the model is lossy.
+- The model cannot render complex legible text.
+- The model does not achieve perfect photorealism.
+- Fingers, etc., may not be generated properly in general.
+- The model was trained on a subset of the web datasets [LAION-5B](https://laion.ai/blog/laion-5b/) and [COYO-700M](https://github.com/kakaobrain/coyo-dataset), which contain adult, violent and sexual content.
+### Bias
+While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.

URSA-1.7B/model_index.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "_class_name": "URSAPipeline",
+  "tokenizer": [
+    "transformers",
+    "Qwen2TokenizerFast"
+  ],
+  "scheduler": [
+    "__scheduler__",
+    "KineticOptimalScheduler"
+  ],
+  "transformer": [
+    "__transformer__",
+    "URSATransformer3DModel"
+  ],
+  "vae": [
+    "__vae__",
+    "AutoencoderVQCosmos3D"
+  ]
+}

URSA/.flake8 ADDED Viewed

	@@ -0,0 +1,21 @@

+[flake8]
+max-line-length = 100
+ignore =
+    # whitespace before ':' (conflicted with Black)
+    E203,
+    # ambiguous variable name
+    E741,
+    # ‘from module import *’ used; unable to detect undefined names
+    F403,
+    # name may be undefined, or defined from star imports: module
+    F405,
+    # redefinition of unused name from line N
+    F811,
+    # undefined name
+    F821,
+    # line break before binary operator
+    W503,
+    # line break after binary operator
+    W504
+# module imported but unused
+per-file-ignores = __init__.py: F401

URSA/.gitignore ADDED Viewed

	@@ -0,0 +1,55 @@

+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.cuo
+# Compiled Dynamic libraries
+*.so
+*.dll
+*.dylib
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+# Compiled python
+*.pyc
+__pycache__
+# Compiled MATLAB
+*.mex*
+# IPython notebook checkpoints
+.ipynb_checkpoints
+# Editor temporaries
+*.swp
+*~
+# Sublime Text settings
+*.sublime-workspace
+*.sublime-project
+# Eclipse Project settings
+*.*project
+.settings
+# QtCreator files
+*.user
+# VSCode files
+.vscode
+# IDEA files
+.idea
+# OSX dir files
+.DS_Store
+# Android files
+.gradle
+*.iml
+local.properties

URSA/=4.57.1 ADDED Viewed

	@@ -0,0 +1,70 @@

+Requirement already satisfied: diffusers in /usr/local/lib/python3.12/dist-packages (0.36.0)
+Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (5.2.0)
+Requirement already satisfied: accelerate in /usr/local/lib/python3.12/dist-packages (1.12.0)
+Requirement already satisfied: imageio in /usr/local/lib/python3.12/dist-packages (2.37.2)
+Requirement already satisfied: imageio-ffmpeg in /usr/local/lib/python3.12/dist-packages (0.6.0)
+Requirement already satisfied: omegaconf in /usr/local/lib/python3.12/dist-packages (2.3.0)
+Requirement already satisfied: wandb in /usr/local/lib/python3.12/dist-packages (0.25.0)
+Requirement already satisfied: importlib_metadata in /usr/local/lib/python3.12/dist-packages/setuptools/_vendor (from diffusers) (8.0.0)
+Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from diffusers) (3.17.0)
+Requirement already satisfied: httpx<1.0.0 in /usr/local/lib/python3.12/dist-packages (from diffusers) (0.28.1)
+Requirement already satisfied: huggingface-hub<2.0,>=0.34.0 in /usr/local/lib/python3.12/dist-packages (from diffusers) (1.3.0)
+Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from diffusers) (1.26.4)
+Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from diffusers) (2024.11.6)
+Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from diffusers) (2.32.3)
+Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.12/dist-packages (from diffusers) (0.5.3)
+Requirement already satisfied: Pillow in /usr/local/lib/python3.12/dist-packages (from diffusers) (11.1.0)
+Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (23.2)
+Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.2)
+Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.22.2)
+Requirement already satisfied: typer-slim in /usr/local/lib/python3.12/dist-packages (from transformers) (0.21.2)
+Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.1)
+Requirement already satisfied: psutil in /usr/local/lib/python3.12/dist-packages (from accelerate) (7.0.0)
+Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from accelerate) (2.9.0+cu128)
+Requirement already satisfied: antlr4-python3-runtime==4.9.* in /usr/local/lib/python3.12/dist-packages (from omegaconf) (4.9.3)
+Requirement already satisfied: click>=8.0.1 in /usr/local/lib/python3.12/dist-packages (from wandb) (8.1.8)
+Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from wandb) (3.1.46)
+Requirement already satisfied: platformdirs in /usr/local/lib/python3.12/dist-packages (from wandb) (4.3.6)
+Requirement already satisfied: protobuf!=4.21.0,!=5.28.0,<7,>=3.19.0 in /usr/local/lib/python3.12/dist-packages (from wandb) (4.24.4)
+Requirement already satisfied: pydantic<3 in /usr/local/lib/python3.12/dist-packages (from wandb) (2.10.6)
+Requirement already satisfied: sentry-sdk>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from wandb) (2.54.0)
+Requirement already satisfied: typing-extensions<5,>=4.8 in /usr/local/lib/python3.12/dist-packages (from wandb) (4.12.2)
+Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.12/dist-packages (from gitpython!=3.1.29,>=1.0.0->wandb) (4.0.12)
+Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->diffusers) (4.8.0)
+Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->diffusers) (2025.1.31)
+Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->diffusers) (1.0.7)
+Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->diffusers) (3.10)
+Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1.0.0->diffusers) (0.14.0)
+Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.34.0->diffusers) (2025.2.0)
+Requirement already satisfied: hf-xet<2.0.0,>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.34.0->diffusers) (1.3.2)
+Requirement already satisfied: shellingham in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=0.34.0->diffusers) (1.5.4)
+Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3->wandb) (0.7.0)
+Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3->wandb) (2.27.2)
+Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->diffusers) (3.4.1)
+Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->diffusers) (2.0.7)
+Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (75.8.2)
+Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (1.14.0)
+Requirement already satisfied: networkx>=2.5.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (3.4.2)
+Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (3.1.6)
+Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.8.93 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.8.93)
+Requirement already satisfied: nvidia-cuda-runtime-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.8.90)
+Requirement already satisfied: nvidia-cuda-cupti-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.8.90)
+Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (9.10.2.21)
+Requirement already satisfied: nvidia-cublas-cu12==12.8.4.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.8.4.1)
+Requirement already satisfied: nvidia-cufft-cu12==11.3.3.83 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (11.3.3.83)
+Requirement already satisfied: nvidia-curand-cu12==10.3.9.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (10.3.9.90)
+Requirement already satisfied: nvidia-cusolver-cu12==11.7.3.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (11.7.3.90)
+Requirement already satisfied: nvidia-cusparse-cu12==12.5.8.93 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.5.8.93)
+Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (0.7.1)
+Requirement already satisfied: nvidia-nccl-cu12==2.27.5 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (2.27.5)
+Requirement already satisfied: nvidia-nvshmem-cu12==3.3.20 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (3.3.20)
+Requirement already satisfied: nvidia-nvtx-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.8.90)
+Requirement already satisfied: nvidia-nvjitlink-cu12==12.8.93 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (12.8.93)
+Requirement already satisfied: nvidia-cufile-cu12==1.13.1.3 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (1.13.1.3)
+Requirement already satisfied: triton==3.5.0 in /usr/local/lib/python3.12/dist-packages (from torch>=2.0.0->accelerate) (3.5.0)
+Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.12/dist-packages/setuptools/_vendor (from importlib_metadata->diffusers) (3.19.2)
+Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer-slim->transformers) (0.0.4)
+Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.12/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb) (5.0.2)
+Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch>=2.0.0->accelerate) (1.3.0)
+Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.12/dist-packages (from anyio->httpx<1.0.0->diffusers) (1.3.1)
+Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch>=2.0.0->accelerate) (3.0.2)

URSA/LICENSE ADDED Viewed

	@@ -0,0 +1,176 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS

URSA/README.md ADDED Viewed

	@@ -0,0 +1,191 @@

+<div align="center">
+<img src="assets/logo.png" width="30%" alt="logo"/>
+<h1>🐻 URSA: Uniform Discrete Diffusion with Metric Path<br>for Video Generation</h1>
+<p align="center">
+<a href="https://arxiv.org/abs/2510.24717"><img src="https://img.shields.io/badge/ArXiv-2510.24717-%23840707.svg" alt="ArXiv"></a>
+<a href="https://huggingface.co/collections/BAAI/ursa"><img src="https://img.shields.io/badge/🤗 Weights-BAAI/URSA-rgb(166,109,59).svg" alt=""></a>
+<a href="https://huggingface.co/spaces/BAAI/nova-d48w1024-osp480"><img src="https://img.shields.io/badge/🤗 Demo-TI2V-%23840707.svg" alt="TI2VDemo"></a>
+<a href="http://bitterdhg.github.io/URSA_page"><img src="https://img.shields.io/badge/Project-URSA-%237CB4F7.svg" alt="Project"></a>
+</p>
+<p align="center">
+[Haoge Deng](https://scholar.google.com/citations?user=S2sbvjgAAAAJ&hl)<sup>1,4*</sup>, [Ting Pan](https://scholar.google.com/citations?&user=qQv6YbsAAAAJ)<sup>2,4*</sup>, [Fan Zhang](https://scholar.google.com/citations?user=VsJ39HMAAAAJ)<sup>4*</sup>, [Yang Liu](https://scholar.google.com/citations?user=9JcQ2hwAAAAJ&hl)<sup>3,4*</sup>, [Zhuoyan Luo](https://scholar.google.com/citations?user=mKQhEsIAAAAJ&hl)<sup>4</sup>, [Yufeng Cui](https://scholar.google.com/citations?user=5Ydha2EAAAAJ&hl)<sup>4</sup>, [Wenxuan Wang](https://scholar.google.com/citations?user=75OyC-oAAAAJ&hl)<sup>4</sup><br>
+[Chunhua Shen](https://scholar.google.com/citations?user=Ljk2BvIAAAAJ&hl)<sup>3</sup>, [Shiguang Shan](https://scholar.google.com/citations?user=Vkzd7MIAAAAJ&hl)<sup>2</sup>, [Zhaoxiang Zhang](https://scholar.google.com/citations?user=qxWfV6cAAAAJ&hl)<sup>1†</sup>, [Xinlong Wang](https://scholar.google.com/citations?user=DPz0DjYAAAAJ&hl)<sup>4†</sup><br>
+[CASIA](http://english.ia.cas.cn)<sup>1</sup>, [CASICT](http://english.ict.cas.cn)<sup>2</sup>, [ZJU](https://www.zju.edu.cn/english)<sup>3</sup>, [BAAI](https://www.baai.ac.cn/en)<sup>4</sup><br>
+<sup>*</sup> Equal Contribution, <sup>†</sup> Corresponding Author
+<br><br><image src="assets/model_preview.gif"/>
+<br><br><image src="assets/model_overview.png"/>
+</div>
+We present **URSA** (**U**niform disc**R**ete diffu**S**ion with metric p**A**th), a simple yet powerful framework that bridges the gap with continuous approaches. **URSA** formulates the video generation task as an iterative global refinement of discrete spatiotemporal tokens and scales efficiently to long video generation, requiring fewer inference steps. **URSA** enables multi-task video generation with an asynchronous timestep scheduling strategy in one unified model.
+## 🚀 News
+- ```[Feb 2026]``` Accepted by ICLR 2026 [[OpenReview]](https://openreview.net/forum?id=GFU5yCbILk).
+- ```[Jan 2026]``` Released [Training Guide](./docs/training.md).
+- ```[Oct 2025]``` 🎉 URSA is part of [Emu3.5](https://github.com/baaivision/Emu3.5) as DiDA (Discrete Diffusion Adaptation)!
+- ```[Oct 2025]``` Released <a href="https://huggingface.co/spaces/BAAI/nova-d48w1024-osp480"><b>TI2V</b></a> 🤗 Demo.
+- ```[Oct 2025]``` Released [Paper](https://arxiv.org/abs/2510.24717) & [Project Page](http://bitterdhg.github.io/URSA_page) & [Evaluation Guide](./docs/evaluation.md).
+## ✨ Highlights
+- 🥇 **Novel Approach**: Uniform Discrete Diffusion with Metric Path.
+- 🥈 **SOTA Performance**: High efficiency with state-of-the-art T2I/T2V/I2V results.
+- 🥉 **Unified Modeling**: Multi-task capabilities in a single unified model.
+## 🗄️ Models
+### 🖼️ Text to Image
+| Model | Resolution | Data | Weight | GenEval | DPGBench |
+|:-----:|:----------:|:----:|:------:|:-------:|:--------:|
+| URSA-0.6B-IBQ1024 | 1024x1024 | 30M | [🤗 HF](https://huggingface.co/BAAI/URSA-0.6B-IBQ1024) \| [🤖 ModelScope](https://www.modelscope.cn/models/BAAI/URSA-0.6B-IBQ1024) | 0.79 | 85.6 |
+| URSA-1.7B-IBQ1024 | 1024x1024 | 30M | [🤗 HF](https://huggingface.co/BAAI/URSA-1.7B-IBQ1024) \| [🤖 ModelScope](https://www.modelscope.cn/models/BAAI/URSA-1.7B-IBQ1024) | 0.80 | 86.0 |
+### 🎬 Text to Video
+| Model | Resolution | Data | Weight | VBench-T2V | VBench-I2V |
+|:-----:|:----------:|:----:|:------:|:----------:|:----------:|
+| URSA-0.6B-FSQ320 | 49x512x320 | 24M | [🤗 HF](https://huggingface.co/BAAI/URSA-0.6B-FSQ320) \| [🤖 ModelScope](https://www.modelscope.cn/models/BAAI/URSA-0.6B-FSQ320) | 81.4 | 86.0 |
+| URSA-1.7B-FSQ320 | 49x512x320 | 24M | [🤗 HF](https://huggingface.co/BAAI/URSA-1.7B-FSQ320) \| [🤖 ModelScope](https://www.modelscope.cn/models/BAAI/URSA-1.7B-FSQ320) | 82.4 | 86.2 |
+## 📖 Table of Contents
+- [🔧 Installation](#installation)
+- [🔥 Quick Start](#quick-start)
+  - [🖼️ Image Generation](#quickstart-image-generation)
+  - [🎬 Video Generation](#quickstart-video-generation)
+- [💻 Gradio Demo](#gradio-demo)
+- [💯 Evaluation](./docs/evaluation.md)
+- [🤖 Training](./docs/training.md)
+## 🔧 Installation
+<a id="installation"></a>
+Clone this repository to local disk and install:
+```bash
+pip install diffusers "transformers>=4.57.1" accelerate imageio imageio-ffmpeg omegaconf wandb
+git clone https://github.com/baaivision/URSA.git
+cd URSA && pip install .
+```
+## 🔥 Quick Start
+<a id="quick-start"></a>
+### 🖼️ Image Generation
+<a id="quickstart-image-generation"></a>
+```python
+import torch
+from diffnext.pipelines import URSAPipeline
+model_id, height, width = "BAAI/URSA-1.7B-IBQ1024", 1024, 1024
+model_args = {"torch_dtype": torch.float16, "trust_remote_code": True}
+pipe = URSAPipeline.from_pretrained(model_id, **model_args)
+pipe = pipe.to(torch.device("cuda"))
+prompt = "The bear, calm and still, gazes upward as if lost in contemplation of the cosmos."
+negative_prompt = "worst quality, low quality, inconsistent motion, static, still, blurry, jittery, distorted, ugly"
+image = pipe(**locals()).frames[0]
+image.save("ursa.jpg")
+```
+### 🎬 Video Generation
+<a id="quickstart-video-generation"></a>
+```python
+import os, torch, numpy
+from diffnext.pipelines import URSAPipeline
+from diffnext.utils import export_to_video
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+model_id, height, width = "BAAI/URSA-1.7B-FSQ320", 320, 512
+model_args = {"torch_dtype": torch.float16, "trust_remote_code": True}
+pipe = URSAPipeline.from_pretrained(model_id, **model_args)
+pipe = pipe.to(torch.device("cuda"))
+text_prompt = "a lone grizzly bear walks through a misty forest at dawn, sunlight catching its fur."
+negative_prompt = "worst quality, low quality, inconsistent motion, static, still, blurry, jittery, distorted, ugly"
+# Text-to-Image
+prompt = text_prompt
+num_frames, num_inference_steps = 1, 25
+image = pipe(**locals()).frames[0]
+image.save("ursa.jpg")
+# Image-to-Video
+prompt = f"motion=9.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+video = pipe(**locals()).frames[0]
+export_to_video(video, "ursa_1+48f.mp4", fps=12)
+# Text-to-Video
+image, video = None, None
+prompt = f"motion=9.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+video = pipe(**locals()).frames[0]
+export_to_video(video, "ursa_49f.mp4", fps=12)
+# Video-to-Video
+prompt = f"motion=5.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+num_cond_frames, cond_noise_scale = 13, 0.1
+for i in range(12):
+    video, start_video = video[-num_cond_frames:], video
+    video = pipe(**locals()).frames[0]
+    video = numpy.concatenate([start_video, video[num_cond_frames:]])
+    export_to_video(video, "ursa_{}f.mp4".format(video.shape[0]), fps=12)
+```
+## 💻 Gradio Demo
+<a id="gradio-demo"></a>
+```bash
+# Text-to-Image (T2I)
+python scripts/app_ursa_t2i.py --model "BAAI/URSA-1.7B-IBQ1024" --device 0
+# Text-to-Image-to-Video (TI2V)
+python scripts/app_ursa_ti2v.py --model "BAAI/URSA-1.7B-FSQ320" --device 0
+```
+## 📋 Todo List
+- [X] [Model Zoo](#model-zoo)
+- [X] [Quick Start](#quick-start)
+- [X] [Gradio Demo](#gradio-demo)
+- [X] [Evaluation Guide](./docs/evaluation.md)
+- [X] [Training Guide](./docs/training.md)
+- [ ] 4B Model
+## 📖 Citation
+If you find this repository useful, please consider giving a star ⭐ and citation 🦖:
+```
+@article{deng2025ursa,
+  title={Uniform Discrete Diffusion with Metric Path for Video Generation},
+  author={Deng, Haoge and Pan, Ting and Zhang, Fan and Liu, Yang and Luo, Zhuoyan and Cui, Yufeng and Shen, Chunhua and Shan, Shiguang and Zhang, Zhaoxiang and Wang, Xinlong},
+  journal={arXiv preprint arXiv:2510.24717},
+  year={2025}
+}
+```
+```
+@article{deng2024nova,
+  title={Autoregressive Video Generation without Vector Quantization},
+  author={Deng, Haoge and Pan, Ting and Diao, Haiwen and Luo, Zhengxiong and Cui, Yufeng and Lu, Huchuan and Shan, Shiguang and Qi, Yonggang and Wang, Xinlong},
+  journal={arXiv preprint arXiv:2412.14169},
+  year={2024}
+}
+```
+## 🤗 Acknowledgement
+We thank the repositories:
+- [NOVA](https://github.com/baaivision/NOVA). ✨NOVA is the predecessor of 🐻URSA.
+- [FlowMatching](https://github.com/facebookresearch/flow_matching). This codebase systematically provides CFM and DFM implementations.
+- [FUDOKI](https://github.com/fudoki-hku/FUDOKI). This codebase provides a naive multimodal DFM implementation.
+- [CodeWithGPU](https://github.com/seetacloud/codewithgpu). CodeWithGPU library is the core of our data loading pipeline.
+## License
+Code and models are licensed under [Apache License 2.0](LICENSE).

URSA/inference.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import os, torch, numpy
+from diffnext.pipelines import URSAPipeline
+from diffnext.utils import export_to_video
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+model_id, height, width = "BAAI/URSA-1.7B-FSQ320", 320, 512
+model_args = {"torch_dtype": torch.bfloat16, "trust_remote_code": True}
+pipe = URSAPipeline.from_pretrained(model_id, **model_args)
+pipe = pipe.to(torch.device("cuda"))
+text_prompt = "tom and jerry"#"a lone grizzly bear walks through a misty forest at dawn, sunlight catching its fur."
+negative_prompt = "worst quality, low quality, inconsistent motion, static, still, blurry, jittery, distorted, ugly"
+import time
+t1 = time.time()
+# Text-to-Image
+prompt = text_prompt
+num_frames, num_inference_steps = 1, 25
+image = pipe(**locals()).frames[0]
+image.save("tom/ursa.jpg")
+t2 = time.time()
+# Image-to-Video
+prompt = f"motion=9.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+video = pipe(**locals()).frames[0]
+export_to_video(video, "tom/ursa_1+48f.mp4", fps=12)
+t3 = time.time()
+# Text-to-Video
+image, video = None, None
+prompt = f"motion=9.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+video = pipe(**locals()).frames[0]
+export_to_video(video, "tom/ursa_49f.mp4", fps=12)
+t4 = time.time()
+# Video-to-Video
+prompt = f"motion=5.0, {text_prompt}"
+num_frames, num_inference_steps = 49, 50
+num_cond_frames, cond_noise_scale = 13, 0.1
+for i in range(12):
+    video, start_video = video[-num_cond_frames:], video
+    video = pipe(**locals()).frames[0]
+    video = numpy.concatenate([start_video, video[num_cond_frames:]])
+    export_to_video(video, "tom/ursa_{}f.mp4".format(video.shape[0]), fps=12)
+t5 = time.time()
+print(f"Text-to-Image time: {t2-t1:.2f} seconds")
+print(f"Image-to-Video time: {t3-t2:.2f} seconds")
+print(f"Text-to-Video time: {t4-t3:.2f} seconds")
+print(f"Video-to-Video time: {t5-t4:.2f} seconds")
+# Single H800 GPU, batch_size=1, the inference time is:
+# Text-to-Image time: 5.05 seconds
+# Image-to-Video time: 101.92 seconds
+# Text-to-Video time: 101.52 seconds
+# Video-to-Video time: 1226.25 seconds
+# cd URSA/
+# source .venv_ursa/bin/activate
+# accelerate launch --config_file accelerate_configs/deepspeed_zero2.yaml     --machine_rank 0 --num_machines 1 --num_processes 8     scripts/train_distill_dimo.py     config="./configs/distill_dimo.yaml"     experiment.output_dir="./experiments/distill_dimo_v3"     distill.teacher_ckpt="/gfs/space/private/fengzl/World_Model/URSA-1.7B"     distill.prompt_source="/gfs/space/private/fengzl/World_Model/Koala-36M-v1"

URSA/pyproject.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+[tool.black]
+line-length = 100
+target-version = ['py310']

URSA/requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+torch
+diffusers
+transformers>=4.57.1
+accelerate
+imageio
+imageio-ffmpeg
+omegaconf
+wandb
+scipy
+codewithgpu

URSA/setup.py ADDED Viewed

	@@ -0,0 +1,133 @@

+# ------------------------------------------------------------------------
+# Copyright (c) 2024-present, BAAI. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ------------------------------------------------------------------------
+"""Python setup script."""
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+import setuptools
+import setuptools.command.build_py
+import setuptools.command.install
+def parse_args():
+    """Parse arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--version", default=None)
+    args, unknown = parser.parse_known_args()
+    sys.argv = [sys.argv[0]] + unknown
+    args.git_version = None
+    args.long_description = ""
+    if args.version is None and os.path.exists("version.txt"):
+        with open("version.txt", "r") as f:
+            args.version = f.read().strip()
+    if os.path.exists(".git"):
+        try:
+            git_version = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd="./")
+            args.git_version = git_version.decode("ascii").strip()
+        except (OSError, subprocess.CalledProcessError):
+            pass
+    if os.path.exists("README.md"):
+        with open(os.path.join("README.md"), encoding="utf-8") as f:
+            args.long_description = f.read()
+    return args
+def clean_builds():
+    for path in ["build", "diffnext.egg-info"]:
+        if os.path.exists(path):
+            shutil.rmtree(path)
+def find_packages(top):
+    """Return the python sources installed to package."""
+    packages = []
+    for root, _, _ in os.walk(top):
+        if os.path.exists(os.path.join(root, "__init__.py")):
+            packages.append(root)
+    return packages
+def find_package_data():
+    """Return the external data installed to package."""
+    return []
+class BuildPyCommand(setuptools.command.build_py.build_py):
+    """Enhanced 'build_py' command."""
+    def build_packages(self):
+        with open("diffnext/version.py", "w") as f:
+            f.write(
+                'version = "{}"\n'
+                'git_version = "{}"\n'
+                "__version__ = version\n".format(args.version, args.git_version)
+            )
+        super(BuildPyCommand, self).build_packages()
+    def build_package_data(self):
+        self.package_data = {"diffnext": find_package_data()}
+        super(BuildPyCommand, self).build_package_data()
+class InstallCommand(setuptools.command.install.install):
+    """Enhanced 'install' command."""
+    def initialize_options(self):
+        super(InstallCommand, self).initialize_options()
+        self.old_and_unmanageable = True
+args = parse_args()
+setuptools.setup(
+    name="diffnext",
+    version=args.version,
+    description="A diffusers based library for autoregressive diffusion models.",
+    long_description=args.long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/baaivision/URSA",
+    author="BAAI",
+    license="Apache License",
+    packages=find_packages("diffnext"),
+    cmdclass={"build_py": BuildPyCommand, "install": InstallCommand},
+    install_requires=[
+        "torch",
+        "diffusers",
+        "transformers",
+        "accelerate",
+        "imageio",
+        "imageio-ffmpeg",
+        "omegaconf",
+        "wandb",
+        "scipy",
+    ],
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3 :: Only",
+        "Topic :: Scientific/Engineering",
+        "Topic :: Scientific/Engineering :: Mathematics",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+)
+clean_builds()

URSA/ursa.jpg ADDED Viewed

URSA/version.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 0.3.0a0